diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h index 267f6a4fe23..8cf13efdb61 100644 --- a/core/math/math_funcs.h +++ b/core/math/math_funcs.h @@ -103,6 +103,9 @@ public: static _ALWAYS_INLINE_ double log(double p_x) { return ::log(p_x); } static _ALWAYS_INLINE_ float log(float p_x) { return ::logf(p_x); } + static _ALWAYS_INLINE_ double log2(double p_x) { return ::log2(p_x); } + static _ALWAYS_INLINE_ float log2(float p_x) { return ::log2f(p_x); } + static _ALWAYS_INLINE_ double exp(double p_x) { return ::exp(p_x); } static _ALWAYS_INLINE_ float exp(float p_x) { return ::expf(p_x); } diff --git a/doc/classes/GeometryInstance3D.xml b/doc/classes/GeometryInstance3D.xml index 631a30ababb..b2c3bfc3ed0 100644 --- a/doc/classes/GeometryInstance3D.xml +++ b/doc/classes/GeometryInstance3D.xml @@ -48,6 +48,8 @@ + + diff --git a/doc/classes/Occluder3D.xml b/doc/classes/Occluder3D.xml new file mode 100644 index 00000000000..fc676c2b490 --- /dev/null +++ b/doc/classes/Occluder3D.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/doc/classes/OccluderInstance3D.xml b/doc/classes/OccluderInstance3D.xml new file mode 100644 index 00000000000..76b784d21d6 --- /dev/null +++ b/doc/classes/OccluderInstance3D.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index 2bfe7ad48fb..e090e20d9ff 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -1469,6 +1469,12 @@ + + + + + + Number of cubemaps to store in the reflection atlas. The number of [ReflectionProbe]s in a scene will be limited by this amount. A higher number requires more VRAM. 
diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml index d6eaa1b88b5..638b0bb2973 100644 --- a/doc/classes/RenderingServer.xml +++ b/doc/classes/RenderingServer.xml @@ -1999,6 +1999,24 @@ Sets the number of instances visible at a given time. If -1, all instances that have been allocated are drawn. Equivalent to [member MultiMesh.visible_instance_count]. + + + + + + + + + + + + + + + + + + @@ -2412,6 +2430,16 @@ The scenario is the 3D world that all the visual instances exist in. + + + + + + + + + + @@ -2897,6 +2925,22 @@ Sets the anti-aliasing mode. See [enum ViewportMSAA] for options. + + + + + + + + + + + + + + + + @@ -3002,6 +3046,16 @@ + + + + + + + + + + @@ -3454,6 +3508,8 @@ + + Uses high quality importance sampling to process the radiance map. In general, this results in much higher quality than [constant Sky.PROCESS_MODE_REALTIME] but takes much longer to generate. This should not be used if you plan on changing the sky at runtime. If you are finding that the reflection is not blurry enough and is showing sparkles or fireflies, try increasing [member ProjectSettings.rendering/reflections/sky_reflections/ggx_samples]. @@ -3606,6 +3662,12 @@ Draw all objects without shading. Equivalent to setting all objects shaders to [code]unshaded[/code]. + + + + + + The instance does not have a type. @@ -3638,7 +3700,9 @@ The instance is a lightmap. - + + + Represents the size of the [enum InstanceType] enum. @@ -3653,7 +3717,9 @@ When set, manually requests to draw geometry on next frame. - + + + Represents the size of the [enum InstanceFlags] enum. diff --git a/doc/classes/Viewport.xml b/doc/classes/Viewport.xml index 471d21374d0..cce57053798 100644 --- a/doc/classes/Viewport.xml +++ b/doc/classes/Viewport.xml @@ -267,6 +267,8 @@ + + The custom [World2D] which can be used as 2D environment source. @@ -419,6 +421,8 @@ + + The texture filter reads from the nearest pixel only. 
The simplest and fastest method of filtering, but the texture will look pixelized. diff --git a/drivers/dummy/rasterizer_dummy.h b/drivers/dummy/rasterizer_dummy.h index 9d6be1a802f..f9a76d16033 100644 --- a/drivers/dummy/rasterizer_dummy.h +++ b/drivers/dummy/rasterizer_dummy.h @@ -548,6 +548,12 @@ public: void lightmap_set_probe_capture_update_speed(float p_speed) override {} float lightmap_get_probe_capture_update_speed() const override { return 0; } + /* OCCLUDER */ + + RID occluder_allocate() override { return RID(); } + void occluder_initialize(RID p_rid) override {} + void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override {} + /* PARTICLES */ RID particles_allocate() override { return RID(); } diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp index 055baeb81e4..61376175649 100644 --- a/editor/editor_node.cpp +++ b/editor/editor_node.cpp @@ -143,6 +143,7 @@ #include "editor/plugins/multimesh_editor_plugin.h" #include "editor/plugins/navigation_polygon_editor_plugin.h" #include "editor/plugins/node_3d_editor_plugin.h" +#include "editor/plugins/occluder_instance_3d_editor_plugin.h" #include "editor/plugins/ot_features_plugin.h" #include "editor/plugins/packed_scene_translation_parser_plugin.h" #include "editor/plugins/path_2d_editor_plugin.h" @@ -6800,6 +6801,7 @@ EditorNode::EditorNode() { add_editor_plugin(memnew(TextureRegionEditorPlugin(this))); add_editor_plugin(memnew(GIProbeEditorPlugin(this))); add_editor_plugin(memnew(BakedLightmapEditorPlugin(this))); + add_editor_plugin(memnew(OccluderInstance3DEditorPlugin(this))); add_editor_plugin(memnew(Path2DEditorPlugin(this))); add_editor_plugin(memnew(Path3DEditorPlugin(this))); add_editor_plugin(memnew(Line2DEditorPlugin(this))); diff --git a/editor/node_3d_editor_gizmos.cpp b/editor/node_3d_editor_gizmos.cpp index 7dcabafecea..afafd7d195c 100644 --- a/editor/node_3d_editor_gizmos.cpp +++ b/editor/node_3d_editor_gizmos.cpp @@ -47,6 +47,7 @@
#include "scene/3d/listener_3d.h" #include "scene/3d/mesh_instance_3d.h" #include "scene/3d/navigation_region_3d.h" +#include "scene/3d/occluder_instance_3d.h" #include "scene/3d/physics_joint_3d.h" #include "scene/3d/position_3d.h" #include "scene/3d/ray_cast_3d.h" @@ -176,6 +177,7 @@ void EditorNode3DGizmo::Instance::create_instance(Node3D *p_base, bool p_hidden) RS::get_singleton()->instance_geometry_set_cast_shadows_setting(instance, RS::SHADOW_CASTING_SETTING_OFF); int layer = p_hidden ? 0 : 1 << Node3DEditorViewport::GIZMO_EDIT_LAYER; RS::get_singleton()->instance_set_layer_mask(instance, layer); //gizmos are 26 + RS::get_singleton()->instance_geometry_set_flag(instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); } void EditorNode3DGizmo::add_mesh(const Ref &p_mesh, bool p_billboard, const Ref &p_skin_reference, const Ref &p_material) { @@ -1464,6 +1466,44 @@ void MeshInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) { } ///// + +OccluderInstance3DGizmoPlugin::OccluderInstance3DGizmoPlugin() { + create_material("line_material", EDITOR_DEF("editors/3d_gizmos/gizmo_colors/occluder", Color(0.8, 0.5, 1))); +} + +bool OccluderInstance3DGizmoPlugin::has_gizmo(Node3D *p_spatial) { + return Object::cast_to(p_spatial) != nullptr; +} + +String OccluderInstance3DGizmoPlugin::get_gizmo_name() const { + return "OccluderInstance3D"; +} + +int OccluderInstance3DGizmoPlugin::get_priority() const { + return -1; +} + +void OccluderInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) { + OccluderInstance3D *occluder_instance = Object::cast_to(p_gizmo->get_spatial_node()); + + p_gizmo->clear(); + + Ref o = occluder_instance->get_occluder(); + + if (!o.is_valid()) { + return; + } + + Vector lines = o->get_debug_lines(); + if (!lines.is_empty()) { + Ref material = get_material("line_material", p_gizmo); + p_gizmo->add_lines(lines, material); + p_gizmo->add_collision_segments(lines); + } +} + +///// + Sprite3DGizmoPlugin::Sprite3DGizmoPlugin() { } diff --git 
a/editor/node_3d_editor_gizmos.h b/editor/node_3d_editor_gizmos.h index 6f98d3a08ca..95344176ade 100644 --- a/editor/node_3d_editor_gizmos.h +++ b/editor/node_3d_editor_gizmos.h @@ -100,6 +100,18 @@ public: MeshInstance3DGizmoPlugin(); }; +class OccluderInstance3DGizmoPlugin : public EditorNode3DGizmoPlugin { + GDCLASS(OccluderInstance3DGizmoPlugin, EditorNode3DGizmoPlugin); + +public: + bool has_gizmo(Node3D *p_spatial) override; + String get_gizmo_name() const override; + int get_priority() const override; + void redraw(EditorNode3DGizmo *p_gizmo) override; + + OccluderInstance3DGizmoPlugin(); +}; + class Sprite3DGizmoPlugin : public EditorNode3DGizmoPlugin { GDCLASS(Sprite3DGizmoPlugin, EditorNode3DGizmoPlugin); diff --git a/editor/plugins/node_3d_editor_plugin.cpp b/editor/plugins/node_3d_editor_plugin.cpp index 13c7814dacb..023d91be303 100644 --- a/editor/plugins/node_3d_editor_plugin.cpp +++ b/editor/plugins/node_3d_editor_plugin.cpp @@ -2368,6 +2368,9 @@ void Node3DEditorViewport::_project_settings_changed() { viewport->set_screen_space_aa(Viewport::ScreenSpaceAA(ssaa_mode)); const bool use_debanding = GLOBAL_GET("rendering/anti_aliasing/quality/use_debanding"); viewport->set_use_debanding(use_debanding); + + const bool use_occlusion_culling = GLOBAL_GET("rendering/occlusion_culling/use_occlusion_culling"); + viewport->set_use_occlusion_culling(use_occlusion_culling); } void Node3DEditorViewport::_notification(int p_what) { @@ -3071,7 +3074,8 @@ void Node3DEditorViewport::_menu_option(int p_option) { case VIEW_DISPLAY_DEBUG_CLUSTER_OMNI_LIGHTS: case VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS: case VIEW_DISPLAY_DEBUG_CLUSTER_DECALS: - case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: { + case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: + case VIEW_DISPLAY_DEBUG_OCCLUDERS: { static const int display_options[] = { VIEW_DISPLAY_NORMAL, VIEW_DISPLAY_WIREFRAME, @@ -3097,6 +3101,7 @@ void Node3DEditorViewport::_menu_option(int p_option) { 
VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS, VIEW_DISPLAY_DEBUG_CLUSTER_DECALS, VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES, + VIEW_DISPLAY_DEBUG_OCCLUDERS, VIEW_MAX }; static const Viewport::DebugDraw debug_draw_modes[] = { @@ -3124,6 +3129,7 @@ void Node3DEditorViewport::_menu_option(int p_option) { Viewport::DEBUG_DRAW_CLUSTER_SPOT_LIGHTS, Viewport::DEBUG_DRAW_CLUSTER_DECALS, Viewport::DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, + Viewport::DEBUG_DRAW_OCCLUDERS, }; int idx = 0; @@ -3173,6 +3179,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(move_gizmo_instance[i], false); RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(move_gizmo_instance[i], layer); + RS::get_singleton()->instance_geometry_set_flag(move_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); move_plane_gizmo_instance[i] = RS::get_singleton()->instance_create(); RS::get_singleton()->instance_set_base(move_plane_gizmo_instance[i], spatial_editor->get_move_plane_gizmo(i)->get_rid()); @@ -3180,6 +3187,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(move_plane_gizmo_instance[i], false); RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(move_plane_gizmo_instance[i], layer); + RS::get_singleton()->instance_geometry_set_flag(move_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); rotate_gizmo_instance[i] = RS::get_singleton()->instance_create(); RS::get_singleton()->instance_set_base(rotate_gizmo_instance[i], spatial_editor->get_rotate_gizmo(i)->get_rid()); @@ -3187,6 +3195,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[i], false); 
RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[i], layer); + RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); scale_gizmo_instance[i] = RS::get_singleton()->instance_create(); RS::get_singleton()->instance_set_base(scale_gizmo_instance[i], spatial_editor->get_scale_gizmo(i)->get_rid()); @@ -3194,6 +3203,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(scale_gizmo_instance[i], false); RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(scale_gizmo_instance[i], layer); + RS::get_singleton()->instance_geometry_set_flag(scale_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); scale_plane_gizmo_instance[i] = RS::get_singleton()->instance_create(); RS::get_singleton()->instance_set_base(scale_plane_gizmo_instance[i], spatial_editor->get_scale_plane_gizmo(i)->get_rid()); @@ -3201,6 +3211,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(scale_plane_gizmo_instance[i], false); RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(scale_plane_gizmo_instance[i], layer); + RS::get_singleton()->instance_geometry_set_flag(scale_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); } // Rotation white outline @@ -3210,6 +3221,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) { RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[3], false); RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[3], 
RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[3], layer); + RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[3], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); } void Node3DEditorViewport::_finish_gizmo_instances() { @@ -4043,6 +4055,7 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, Edito display_submenu->add_radio_check_item(TTR("Spot Light Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS); display_submenu->add_radio_check_item(TTR("Decal Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_DECALS); display_submenu->add_radio_check_item(TTR("Reflection Probe Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES); + display_submenu->add_radio_check_item(TTR("Occlusion Culling Buffer"), VIEW_DISPLAY_DEBUG_OCCLUDERS); display_submenu->set_name("display_advanced"); view_menu->get_popup()->add_submenu_item(TTR("Display Advanced..."), "display_advanced", VIEW_DISPLAY_ADVANCED); @@ -4625,6 +4638,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) { si->sbox_instance, RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(si->sbox_instance, 1 << Node3DEditorViewport::MISC_TOOL_LAYER); + RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); si->sbox_instance_xray = RenderingServer::get_singleton()->instance_create2( selection_box_xray->get_rid(), sp->get_world_3d()->get_scenario()); @@ -4632,6 +4646,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) { si->sbox_instance_xray, RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(si->sbox_instance_xray, 1 << Node3DEditorViewport::MISC_TOOL_LAYER); + RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance_xray, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); return si; } @@ -5403,6 +5418,7 @@ void Node3DEditor::_init_indicators() { origin_instance = 
RenderingServer::get_singleton()->instance_create2(origin, get_tree()->get_root()->get_world_3d()->get_scenario()); RS::get_singleton()->instance_set_layer_mask(origin_instance, 1 << Node3DEditorViewport::GIZMO_GRID_LAYER); + RS::get_singleton()->instance_geometry_set_flag(origin_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(origin_instance, RS::SHADOW_CASTING_SETTING_OFF); } @@ -5964,6 +5980,7 @@ void Node3DEditor::_init_grid() { RenderingServer::get_singleton()->instance_set_visible(grid_instance[c], grid_visible[a]); RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(grid_instance[c], RS::SHADOW_CASTING_SETTING_OFF); RS::get_singleton()->instance_set_layer_mask(grid_instance[c], 1 << Node3DEditorViewport::GIZMO_GRID_LAYER); + RS::get_singleton()->instance_geometry_set_flag(grid_instance[c], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true); } } @@ -6465,6 +6482,7 @@ void Node3DEditor::_register_all_gizmos() { add_gizmo_plugin(Ref(memnew(Light3DGizmoPlugin))); add_gizmo_plugin(Ref(memnew(AudioStreamPlayer3DGizmoPlugin))); add_gizmo_plugin(Ref(memnew(MeshInstance3DGizmoPlugin))); + add_gizmo_plugin(Ref(memnew(OccluderInstance3DGizmoPlugin))); add_gizmo_plugin(Ref(memnew(SoftBody3DGizmoPlugin))); add_gizmo_plugin(Ref(memnew(Sprite3DGizmoPlugin))); add_gizmo_plugin(Ref(memnew(Skeleton3DGizmoPlugin))); @@ -7340,6 +7358,7 @@ void EditorNode3DGizmoPlugin::create_material(const String &p_name, const Color material->set_shading_mode(StandardMaterial3D::SHADING_MODE_UNSHADED); material->set_transparency(StandardMaterial3D::TRANSPARENCY_ALPHA); material->set_render_priority(StandardMaterial3D::RENDER_PRIORITY_MIN + 1); + material->set_cull_mode(StandardMaterial3D::CULL_DISABLED); if (p_use_vertex_color) { material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true); diff --git a/editor/plugins/node_3d_editor_plugin.h 
b/editor/plugins/node_3d_editor_plugin.h index 70329f90c7d..33f4c324713 100644 --- a/editor/plugins/node_3d_editor_plugin.h +++ b/editor/plugins/node_3d_editor_plugin.h @@ -221,6 +221,7 @@ class Node3DEditorViewport : public Control { VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS, VIEW_DISPLAY_DEBUG_CLUSTER_DECALS, VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES, + VIEW_DISPLAY_DEBUG_OCCLUDERS, VIEW_LOCK_ROTATION, VIEW_CINEMATIC_PREVIEW, diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.cpp b/editor/plugins/occluder_instance_3d_editor_plugin.cpp new file mode 100644 index 00000000000..0821f140b37 --- /dev/null +++ b/editor/plugins/occluder_instance_3d_editor_plugin.cpp @@ -0,0 +1,117 @@ +/*************************************************************************/ +/* occluder_instance_3d_editor_plugin.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "occluder_instance_3d_editor_plugin.h" + +void OccluderInstance3DEditorPlugin::_bake_select_file(const String &p_file) { + if (occluder_instance) { + OccluderInstance3D::BakeError err; + if (get_tree()->get_edited_scene_root() && get_tree()->get_edited_scene_root() == occluder_instance) { + err = occluder_instance->bake(occluder_instance, p_file); + } else { + err = occluder_instance->bake(occluder_instance->get_parent(), p_file); + } + + switch (err) { + case OccluderInstance3D::BAKE_ERROR_NO_SAVE_PATH: { + String scene_path = occluder_instance->get_filename(); + if (scene_path == String()) { + scene_path = occluder_instance->get_owner()->get_filename(); + } + if (scene_path == String()) { + EditorNode::get_singleton()->show_warning(TTR("Can't determine a save path for the occluder.\nSave your scene and try again.")); + break; + } + scene_path = scene_path.get_basename() + ".occ"; + + file_dialog->set_current_path(scene_path); + file_dialog->popup_file_dialog(); + + } break; + case OccluderInstance3D::BAKE_ERROR_NO_MESHES: { + EditorNode::get_singleton()->show_warning(TTR("No meshes to bake.")); + break; + } + default: { + } + } + } +} + +void OccluderInstance3DEditorPlugin::_bake() { + _bake_select_file(""); +} + +void OccluderInstance3DEditorPlugin::edit(Object *p_object) { + OccluderInstance3D *s = Object::cast_to(p_object); + if (!s) { + return; + } + + 
occluder_instance = s; +} + +bool OccluderInstance3DEditorPlugin::handles(Object *p_object) const { + return p_object->is_class("OccluderInstance3D"); +} + +void OccluderInstance3DEditorPlugin::make_visible(bool p_visible) { + if (p_visible) { + bake->show(); + } else { + bake->hide(); + } +} + +void OccluderInstance3DEditorPlugin::_bind_methods() { + ClassDB::bind_method("_bake", &OccluderInstance3DEditorPlugin::_bake); +} + +OccluderInstance3DEditorPlugin::OccluderInstance3DEditorPlugin(EditorNode *p_node) { + editor = p_node; + bake = memnew(Button); + bake->set_flat(true); + bake->set_icon(editor->get_gui_base()->get_theme_icon("Bake", "EditorIcons")); + bake->set_text(TTR("Bake Occluders")); + bake->hide(); + bake->connect("pressed", Callable(this, "_bake")); + add_control_to_container(CONTAINER_SPATIAL_EDITOR_MENU, bake); + occluder_instance = nullptr; + + file_dialog = memnew(EditorFileDialog); + file_dialog->set_file_mode(EditorFileDialog::FILE_MODE_SAVE_FILE); + file_dialog->add_filter("*.occ ; Occluder3D"); + file_dialog->set_title(TTR("Select occluder bake file:")); + file_dialog->connect("file_selected", callable_mp(this, &OccluderInstance3DEditorPlugin::_bake_select_file)); + bake->add_child(file_dialog); +} + +OccluderInstance3DEditorPlugin::~OccluderInstance3DEditorPlugin() { +} diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.h b/editor/plugins/occluder_instance_3d_editor_plugin.h new file mode 100644 index 00000000000..161b17811c9 --- /dev/null +++ b/editor/plugins/occluder_instance_3d_editor_plugin.h @@ -0,0 +1,66 @@ +/*************************************************************************/ +/* occluder_instance_3d_editor_plugin.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. 
*/ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifndef OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H +#define OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H + +#include "editor/editor_node.h" +#include "editor/editor_plugin.h" +#include "scene/3d/occluder_instance_3d.h" +#include "scene/resources/material.h" + +class OccluderInstance3DEditorPlugin : public EditorPlugin { + GDCLASS(OccluderInstance3DEditorPlugin, EditorPlugin); + + OccluderInstance3D *occluder_instance; + + Button *bake; + EditorNode *editor; + + EditorFileDialog *file_dialog; + + void _bake_select_file(const String &p_file); + void _bake(); + +protected: + static void _bind_methods(); + +public: + virtual String get_name() const override { return "OccluderInstance3D"; } + bool has_main_screen() const override { return false; } + virtual void edit(Object *p_object) override; + virtual bool handles(Object *p_object) const override; + virtual void make_visible(bool p_visible) override; + + OccluderInstance3DEditorPlugin(EditorNode *p_node); + ~OccluderInstance3DEditorPlugin(); +}; + +#endif diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub new file mode 100644 index 00000000000..68e9df52634 --- /dev/null +++ b/modules/raycast/SCsub @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +Import("env") +Import("env_modules") + +embree_src = [ + "common/sys/sysinfo.cpp", + "common/sys/alloc.cpp", + "common/sys/filename.cpp", + "common/sys/library.cpp", + "common/sys/thread.cpp", + "common/sys/string.cpp", + "common/sys/regression.cpp", + "common/sys/mutex.cpp", + "common/sys/condition.cpp", + "common/sys/barrier.cpp", + "common/math/constants.cpp", + "common/simd/sse.cpp", + "common/lexers/stringstream.cpp", + "common/lexers/tokenstream.cpp", + "common/tasking/taskschedulerinternal.cpp", + "common/algorithms/parallel_for.cpp", + "common/algorithms/parallel_reduce.cpp", + "common/algorithms/parallel_prefix_sum.cpp", + "common/algorithms/parallel_for_for.cpp", + 
"common/algorithms/parallel_for_for_prefix_sum.cpp", + "common/algorithms/parallel_partition.cpp", + "common/algorithms/parallel_sort.cpp", + "common/algorithms/parallel_set.cpp", + "common/algorithms/parallel_map.cpp", + "common/algorithms/parallel_filter.cpp", + "kernels/common/device.cpp", + "kernels/common/stat.cpp", + "kernels/common/acceln.cpp", + "kernels/common/accelset.cpp", + "kernels/common/state.cpp", + "kernels/common/rtcore.cpp", + "kernels/common/rtcore_builder.cpp", + "kernels/common/scene.cpp", + "kernels/common/alloc.cpp", + "kernels/common/geometry.cpp", + "kernels/common/scene_triangle_mesh.cpp", + "kernels/geometry/primitive4.cpp", + "kernels/builders/primrefgen.cpp", + "kernels/bvh/bvh.cpp", + "kernels/bvh/bvh_statistics.cpp", + "kernels/bvh/bvh4_factory.cpp", + "kernels/bvh/bvh8_factory.cpp", + "kernels/bvh/bvh_collider.cpp", + "kernels/bvh/bvh_rotate.cpp", + "kernels/bvh/bvh_refit.cpp", + "kernels/bvh/bvh_builder.cpp", + "kernels/bvh/bvh_builder_morton.cpp", + "kernels/bvh/bvh_builder_sah.cpp", + "kernels/bvh/bvh_builder_sah_spatial.cpp", + "kernels/bvh/bvh_builder_sah_mb.cpp", + "kernels/bvh/bvh_builder_twolevel.cpp", + "kernels/bvh/bvh_intersector1_bvh4.cpp", +] + +embree_dir = "#thirdparty/embree-aarch64/" + +env_embree = env_modules.Clone() +embree_sources = [embree_dir + file for file in embree_src] +env_embree.Prepend(CPPPATH=[embree_dir, embree_dir + "include"]) +env_embree.Append(CPPFLAGS=["-DEMBREE_TARGET_SSE2", "-DEMBREE_LOWEST_ISA", "-DTASKING_INTERNAL", "-DNDEBUG"]) + +if not env_embree.msvc: + env_embree.Append(CPPFLAGS=["-msse2", "-mxsave"]) + if env["platform"] == "windows": + env_embree.Append(CPPFLAGS=["-mstackrealign"]) + +if env["platform"] == "windows": + if env.msvc: + env.Append(LINKFLAGS=["psapi.lib"]) + env_embree.Append(CPPFLAGS=["-D__SSE2__", "-D__SSE__"]) + else: + env.Append(LIBS=["psapi"]) + +env_embree.disable_warnings() +env_embree.add_source_files(env.modules_sources, embree_sources) + +env_raycast = 
env_modules.Clone() +env_raycast.Prepend(CPPPATH=[embree_dir, embree_dir + "include", embree_dir + "common"]) + +env_raycast.add_source_files(env.modules_sources, "*.cpp") diff --git a/modules/raycast/config.py b/modules/raycast/config.py new file mode 100644 index 00000000000..26493da41bf --- /dev/null +++ b/modules/raycast/config.py @@ -0,0 +1,12 @@ +def can_build(env, platform): + if platform == "android": + return env["android_arch"] in ["arm64v8", "x86", "x86_64"] + + if platform == "javascript": + return False # No SIMD support yet + + return True + + +def configure(env): + pass diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py new file mode 100644 index 00000000000..db4fa95c21b --- /dev/null +++ b/modules/raycast/godot_update_embree.py @@ -0,0 +1,260 @@ +import glob, os, shutil, subprocess, re + +include_dirs = [ + "common/tasking", + "kernels/bvh", + "kernels/builders", + "common/sys", + "kernels", + "kernels/common", + "common/math", + "common/algorithms", + "common/lexers", + "common/simd", + "include/embree3", + "kernels/subdiv", + "kernels/geometry", +] + +cpp_files = [ + "common/sys/sysinfo.cpp", + "common/sys/alloc.cpp", + "common/sys/filename.cpp", + "common/sys/library.cpp", + "common/sys/thread.cpp", + "common/sys/string.cpp", + "common/sys/regression.cpp", + "common/sys/mutex.cpp", + "common/sys/condition.cpp", + "common/sys/barrier.cpp", + "common/math/constants.cpp", + "common/simd/sse.cpp", + "common/lexers/stringstream.cpp", + "common/lexers/tokenstream.cpp", + "common/tasking/taskschedulerinternal.cpp", + "common/algorithms/parallel_for.cpp", + "common/algorithms/parallel_reduce.cpp", + "common/algorithms/parallel_prefix_sum.cpp", + "common/algorithms/parallel_for_for.cpp", + "common/algorithms/parallel_for_for_prefix_sum.cpp", + "common/algorithms/parallel_partition.cpp", + "common/algorithms/parallel_sort.cpp", + "common/algorithms/parallel_set.cpp", + "common/algorithms/parallel_map.cpp", + 
"common/algorithms/parallel_filter.cpp", + "kernels/common/device.cpp", + "kernels/common/stat.cpp", + "kernels/common/acceln.cpp", + "kernels/common/accelset.cpp", + "kernels/common/state.cpp", + "kernels/common/rtcore.cpp", + "kernels/common/rtcore_builder.cpp", + "kernels/common/scene.cpp", + "kernels/common/alloc.cpp", + "kernels/common/geometry.cpp", + "kernels/common/scene_triangle_mesh.cpp", + "kernels/geometry/primitive4.cpp", + "kernels/builders/primrefgen.cpp", + "kernels/bvh/bvh.cpp", + "kernels/bvh/bvh_statistics.cpp", + "kernels/bvh/bvh4_factory.cpp", + "kernels/bvh/bvh8_factory.cpp", + "kernels/bvh/bvh_collider.cpp", + "kernels/bvh/bvh_rotate.cpp", + "kernels/bvh/bvh_refit.cpp", + "kernels/bvh/bvh_builder.cpp", + "kernels/bvh/bvh_builder_morton.cpp", + "kernels/bvh/bvh_builder_sah.cpp", + "kernels/bvh/bvh_builder_sah_spatial.cpp", + "kernels/bvh/bvh_builder_sah_mb.cpp", + "kernels/bvh/bvh_builder_twolevel.cpp", + "kernels/bvh/bvh_intersector1.cpp", + "kernels/bvh/bvh_intersector1_bvh4.cpp", +] + +os.chdir("../../thirdparty") + +dir_name = "embree-aarch64" +if os.path.exists(dir_name): + shutil.rmtree(dir_name) + +subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"]) +os.chdir("embree-tmp") + +commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip() + +all_files = set(cpp_files) + +dest_dir = os.path.join("..", dir_name) +for include_dir in include_dirs: + headers = glob.iglob(os.path.join(include_dir, "*.h")) + all_files.update(headers) + +for f in all_files: + d = os.path.join(dest_dir, os.path.dirname(f)) + if not os.path.exists(d): + os.makedirs(d) + shutil.copy2(f, d) + +with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file: + hash_file.write( + f""" +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_HASH "{commit_hash}" +""" + ) + +with open(os.path.join(dest_dir, "kernels/config.h"), "w") 
as config_file: + config_file.write( + """ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +/* #undef EMBREE_RAY_MASK */ +/* #undef EMBREE_STAT_COUNTERS */ +/* #undef EMBREE_BACKFACE_CULLING */ +/* #undef EMBREE_BACKFACE_CULLING_CURVES */ +#define EMBREE_FILTER_FUNCTION +/* #undef EMBREE_IGNORE_INVALID_RAYS */ +#define EMBREE_GEOMETRY_TRIANGLE +/* #undef EMBREE_GEOMETRY_QUAD */ +/* #undef EMBREE_GEOMETRY_CURVE */ +/* #undef EMBREE_GEOMETRY_SUBDIVISION */ +/* #undef EMBREE_GEOMETRY_USER */ +/* #undef EMBREE_GEOMETRY_INSTANCE */ +/* #undef EMBREE_GEOMETRY_GRID */ +/* #undef EMBREE_GEOMETRY_POINT */ +/* #undef EMBREE_RAY_PACKETS */ +/* #undef EMBREE_COMPACT_POLYS */ + +#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + #define IF_ENABLED_TRIS(x) x +#else + #define IF_ENABLED_TRIS(x) +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + #define IF_ENABLED_QUADS(x) x +#else + #define IF_ENABLED_QUADS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_CURVES_OR_POINTS(x) x +#else + #define IF_ENABLED_CURVES_OR_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + #define IF_ENABLED_CURVES(x) x +#else + #define IF_ENABLED_CURVES(x) +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_POINTS(x) x +#else + #define IF_ENABLED_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + #define IF_ENABLED_SUBDIV(x) x +#else + #define IF_ENABLED_SUBDIV(x) +#endif + +#if defined(EMBREE_GEOMETRY_USER) + #define IF_ENABLED_USER(x) x +#else + #define IF_ENABLED_USER(x) +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + #define IF_ENABLED_INSTANCE(x) x +#else + #define IF_ENABLED_INSTANCE(x) +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + #define IF_ENABLED_GRIDS(x) x +#else + #define IF_ENABLED_GRIDS(x) +#endif +""" + ) + + +with open("CMakeLists.txt", "r") as cmake_file: + cmake_content = cmake_file.read() + major_version = 
int(re.compile(r"EMBREE_VERSION_MAJOR\s(\d+)").findall(cmake_content)[0]) + minor_version = int(re.compile(r"EMBREE_VERSION_MINOR\s(\d+)").findall(cmake_content)[0]) + patch_version = int(re.compile(r"EMBREE_VERSION_PATCH\s(\d+)").findall(cmake_content)[0]) + +with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file: + config_file.write( + f""" +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define RTC_VERSION_MAJOR {major_version} +#define RTC_VERSION_MINOR {minor_version} +#define RTC_VERSION_PATCH {patch_version} +#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d} +#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}" + +#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 + +#define EMBREE_MIN_WIDTH 0 +#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH + +#define EMBREE_STATIC_LIB +/* #undef EMBREE_API_NAMESPACE */ + +#if defined(EMBREE_API_NAMESPACE) +# define RTC_NAMESPACE +# define RTC_NAMESPACE_BEGIN namespace {{ +# define RTC_NAMESPACE_END }} +# define RTC_NAMESPACE_USE using namespace ; +# define RTC_API_EXTERN_C +# undef EMBREE_API_NAMESPACE +#else +# define RTC_NAMESPACE_BEGIN +# define RTC_NAMESPACE_END +# define RTC_NAMESPACE_USE +# if defined(__cplusplus) +# define RTC_API_EXTERN_C extern "C" +# else +# define RTC_API_EXTERN_C +# endif +#endif + +#if defined(ISPC) +# define RTC_API_IMPORT extern "C" unmasked +# define RTC_API_EXPORT extern "C" unmasked +#elif defined(EMBREE_STATIC_LIB) +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C +#elif defined(_WIN32) +# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport) +# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport) +#else +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default"))) +#endif + +#if defined(RTC_EXPORT_API) +# define RTC_API RTC_API_EXPORT +#else +# define RTC_API 
RTC_API_IMPORT +#endif +""" + ) + +os.chdir("..") +shutil.rmtree("embree-tmp") diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp new file mode 100644 index 00000000000..9039622d3da --- /dev/null +++ b/modules/raycast/lightmap_raycaster.cpp @@ -0,0 +1,202 @@ +/*************************************************************************/ +/* lightmap_raycaster.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifdef TOOLS_ENABLED + +#include "lightmap_raycaster.h" + +// From Embree. +#include +#include + +#include + +using namespace embree; + +LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() { + return memnew(LightmapRaycasterEmbree); +} + +void LightmapRaycasterEmbree::make_default_raycaster() { + create_function = create_embree_raycaster; +} + +void LightmapRaycasterEmbree::filter_function(const struct RTCFilterFunctionNArguments *p_args) { + RTCHit *hit = (RTCHit *)p_args->hit; + + unsigned int geomID = hit->geomID; + float u = hit->u; + float v = hit->v; + + LightmapRaycasterEmbree *scene = (LightmapRaycasterEmbree *)p_args->geometryUserPtr; + RTCGeometry geom = rtcGetGeometry(scene->embree_scene, geomID); + + rtcInterpolate0(geom, hit->primID, hit->u, hit->v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, &hit->u, 2); + + if (scene->alpha_textures.has(geomID)) { + const AlphaTextureData &alpha_texture = scene->alpha_textures[geomID]; + + if (alpha_texture.sample(hit->u, hit->v) < 128) { + p_args->valid[0] = 0; + return; + } + } + + rtcInterpolate0(geom, hit->primID, u, v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, &hit->Ng_x, 3); +} + +bool LightmapRaycasterEmbree::intersect(Ray &r_ray) { + RTCIntersectContext context; + + rtcInitIntersectContext(&context); + + rtcIntersect1(embree_scene, &context, (RTCRayHit *)&r_ray); + return r_ray.geomID != RTC_INVALID_GEOMETRY_ID; +} + +void LightmapRaycasterEmbree::intersect(Vector &r_rays) { + Ray *rays = r_rays.ptrw(); + for (int i = 0; i < r_rays.size(); ++i) { + intersect(rays[i]); + } +} + +void LightmapRaycasterEmbree::set_mesh_alpha_texture(Ref p_alpha_texture, unsigned int p_id) { + if (p_alpha_texture.is_valid() && p_alpha_texture->get_size() != Vector2i()) { + AlphaTextureData tex; + tex.size = p_alpha_texture->get_size(); + tex.data = p_alpha_texture->get_data(); + alpha_textures.insert(p_id, tex); + } +} + +float blerp(float 
c00, float c10, float c01, float c11, float tx, float ty) { + return Math::lerp(Math::lerp(c00, c10, tx), Math::lerp(c01, c11, tx), ty); +} + +uint8_t LightmapRaycasterEmbree::AlphaTextureData::sample(float u, float v) const { + float x = u * size.x; + float y = v * size.y; + int xi = (int)x; + int yi = (int)y; + + uint8_t texels[4]; + + for (int i = 0; i < 4; ++i) { + int sample_x = CLAMP(xi + i % 2, 0, size.x - 1); + int sample_y = CLAMP(yi + i / 2, 0, size.y - 1); + texels[i] = data[sample_y * size.x + sample_x]; + } + + return Math::round(blerp(texels[0], texels[1], texels[2], texels[3], x - xi, y - yi)); +} + +void LightmapRaycasterEmbree::add_mesh(const Vector &p_vertices, const Vector &p_normals, const Vector &p_uv2s, unsigned int p_id) { + RTCGeometry embree_mesh = rtcNewGeometry(embree_device, RTC_GEOMETRY_TYPE_TRIANGLE); + + rtcSetGeometryVertexAttributeCount(embree_mesh, 2); + + int vertex_count = p_vertices.size(); + + ERR_FAIL_COND(vertex_count % 3 != 0); + ERR_FAIL_COND(vertex_count != p_uv2s.size()); + + Vec3fa *embree_vertices = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count); + Vec2fa *embree_light_uvs = (Vec2fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vec2fa), vertex_count); + uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3); + + Vec3fa *embree_normals = nullptr; + if (!p_normals.is_empty()) { + embree_normals = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count); + } + + for (int i = 0; i < vertex_count; i++) { + embree_vertices[i] = Vec3fa(p_vertices[i].x, p_vertices[i].y, p_vertices[i].z); + embree_light_uvs[i] = Vec2fa(p_uv2s[i].x, p_uv2s[i].y); + if (embree_normals != nullptr) { + embree_normals[i] = 
Vec3fa(p_normals[i].x, p_normals[i].y, p_normals[i].z); + } + embree_triangles[i] = i; + } + + rtcCommitGeometry(embree_mesh); + rtcSetGeometryIntersectFilterFunction(embree_mesh, filter_function); + rtcSetGeometryUserData(embree_mesh, this); + rtcAttachGeometryByID(embree_scene, embree_mesh, p_id); + rtcReleaseGeometry(embree_mesh); +} + +void LightmapRaycasterEmbree::commit() { + rtcCommitScene(embree_scene); +} + +void LightmapRaycasterEmbree::set_mesh_filter(const Set &p_mesh_ids) { + for (Set::Element *E = p_mesh_ids.front(); E; E = E->next()) { + rtcDisableGeometry(rtcGetGeometry(embree_scene, E->get())); + } + rtcCommitScene(embree_scene); + filter_meshes = p_mesh_ids; +} + +void LightmapRaycasterEmbree::clear_mesh_filter() { + for (Set::Element *E = filter_meshes.front(); E; E = E->next()) { + rtcEnableGeometry(rtcGetGeometry(embree_scene, E->get())); + } + rtcCommitScene(embree_scene); + filter_meshes.clear(); +} + +void embree_error_handler(void *p_user_data, RTCError p_code, const char *p_str) { + print_error("Embree error: " + String(p_str)); +} + +LightmapRaycasterEmbree::LightmapRaycasterEmbree() { + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + + embree_device = rtcNewDevice(nullptr); + rtcSetDeviceErrorFunction(embree_device, &embree_error_handler, nullptr); + embree_scene = rtcNewScene(embree_device); +} + +LightmapRaycasterEmbree::~LightmapRaycasterEmbree() { + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); + + if (embree_scene != nullptr) { + rtcReleaseScene(embree_scene); + } + + if (embree_device != nullptr) { + rtcReleaseDevice(embree_device); + } +} + +#endif diff --git a/modules/raycast/lightmap_raycaster.h b/modules/raycast/lightmap_raycaster.h new file mode 100644 index 00000000000..4c3de278374 --- /dev/null +++ b/modules/raycast/lightmap_raycaster.h @@ -0,0 +1,77 @@ 
+/*************************************************************************/ +/* lightmap_raycaster.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifdef TOOLS_ENABLED + +#include "core/object/object.h" +#include "scene/3d/lightmapper.h" +#include "scene/resources/mesh.h" + +#include + +class LightmapRaycasterEmbree : public LightmapRaycaster { + GDCLASS(LightmapRaycasterEmbree, LightmapRaycaster); + +private: + struct AlphaTextureData { + Vector data; + Vector2i size; + + uint8_t sample(float u, float v) const; + }; + + RTCDevice embree_device; + RTCScene embree_scene; + + static void filter_function(const struct RTCFilterFunctionNArguments *p_args); + + Map alpha_textures; + Set filter_meshes; + +public: + virtual bool intersect(Ray &p_ray) override; + + virtual void intersect(Vector &r_rays) override; + + virtual void add_mesh(const Vector &p_vertices, const Vector &p_normals, const Vector &p_uv2s, unsigned int p_id) override; + virtual void set_mesh_alpha_texture(Ref p_alpha_texture, unsigned int p_id) override; + virtual void commit() override; + + virtual void set_mesh_filter(const Set &p_mesh_ids) override; + virtual void clear_mesh_filter() override; + + static LightmapRaycaster *create_embree_raycaster(); + static void make_default_raycaster(); + + LightmapRaycasterEmbree(); + ~LightmapRaycasterEmbree(); +}; + +#endif diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp new file mode 100644 index 00000000000..ea43255eef8 --- /dev/null +++ b/modules/raycast/raycast_occlusion_cull.cpp @@ -0,0 +1,583 @@ +/*************************************************************************/ +/* raycast_occlusion_cull.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#include "raycast_occlusion_cull.h" +#include "core/config/project_settings.h" +#include "core/templates/local_vector.h" + +#ifdef __SSE2__ +#include +#endif + +RaycastOcclusionCull *RaycastOcclusionCull::raycast_singleton = nullptr; + +void RaycastOcclusionCull::RaycastHZBuffer::clear() { + HZBuffer::clear(); + + camera_rays.clear(); + camera_ray_masks.clear(); + packs_size = Size2i(); +} + +void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) { + if (p_size == Size2i()) { + clear(); + return; + } + + if (!sizes.is_empty() && p_size == sizes[0]) { + return; // Size didn't change + } + + HZBuffer::resize(p_size); + + packs_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE)); + int ray_packets_count = packs_size.x * packs_size.y; + camera_rays.resize(ray_packets_count); + camera_ray_masks.resize(ray_packets_count * TILE_SIZE * TILE_SIZE); +} + +void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) { + CameraRayThreadData td; + td.camera_matrix = p_cam_projection; + td.camera_transform = p_cam_transform; + td.camera_orthogonal = p_cam_orthogonal; + td.thread_count = p_thread_work_pool.get_thread_count(); + + p_thread_work_pool.do_work(td.thread_count, this, &RaycastHZBuffer::_camera_rays_threaded, &td); +} + +void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, RaycastOcclusionCull::RaycastHZBuffer::CameraRayThreadData *p_data) { + uint32_t packs_total = camera_rays.size(); + uint32_t total_threads = p_data->thread_count; + uint32_t from = p_thread * packs_total / total_threads; + uint32_t to = (p_thread + 1 == total_threads) ? 
packs_total : ((p_thread + 1) * packs_total / total_threads); + _generate_camera_rays(p_data->camera_transform, p_data->camera_matrix, p_data->camera_orthogonal, from, to); +} + +void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to) { + Size2i buffer_size = sizes[0]; + + CameraMatrix inv_camera_matrix = p_cam_projection.inverse(); + float z_far = p_cam_projection.get_z_far() * 1.05f; + debug_tex_range = z_far; + + RayPacket *ray_packets = camera_rays.ptr(); + uint32_t *ray_masks = camera_ray_masks.ptr(); + + for (int i = p_from; i < p_to; i++) { + RayPacket &packet = ray_packets[i]; + int tile_x = (i % packs_size.x) * TILE_SIZE; + int tile_y = (i / packs_size.x) * TILE_SIZE; + + for (int j = 0; j < TILE_RAYS; j++) { + float x = tile_x + j % TILE_SIZE; + float y = tile_y + j / TILE_SIZE; + + ray_masks[i * TILE_RAYS + j] = ~0U; + + if (x >= buffer_size.x || y >= buffer_size.y) { + ray_masks[i * TILE_RAYS + j] = 0U; + } else { + float u = x / (buffer_size.x - 1); + float v = y / (buffer_size.y - 1); + u = u * 2.0f - 1.0f; + v = v * 2.0f - 1.0f; + + Plane pixel_proj = Plane(u, v, -1.0, 1.0); + Plane pixel_view = inv_camera_matrix.xform4(pixel_proj); + Vector3 pixel_world = p_cam_transform.xform(pixel_view.normal); + + Vector3 dir; + if (p_cam_orthogonal) { + dir = -p_cam_transform.basis.get_axis(2); + } else { + dir = (pixel_world - p_cam_transform.origin).normalized(); + } + + packet.ray.org_x[j] = pixel_world.x; + packet.ray.org_y[j] = pixel_world.y; + packet.ray.org_z[j] = pixel_world.z; + + packet.ray.dir_x[j] = dir.x; + packet.ray.dir_y[j] = dir.y; + packet.ray.dir_z[j] = dir.z; + + packet.ray.tnear[j] = 0.0f; + + packet.ray.time[j] = 0.0f; + + packet.ray.flags[j] = 0; + packet.ray.mask[j] = -1; + packet.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID; + } + + packet.ray.tfar[j] = z_far; + } + } +} + +void 
RaycastOcclusionCull::RaycastHZBuffer::sort_rays() { + if (is_empty()) { + return; + } + + Size2i buffer_size = sizes[0]; + for (int i = 0; i < packs_size.y; i++) { + for (int j = 0; j < packs_size.x; j++) { + for (int tile_i = 0; tile_i < TILE_SIZE; tile_i++) { + for (int tile_j = 0; tile_j < TILE_SIZE; tile_j++) { + int x = j * TILE_SIZE + tile_j; + int y = i * TILE_SIZE + tile_i; + if (x >= buffer_size.x || y >= buffer_size.y) { + continue; + } + int k = tile_i * TILE_SIZE + tile_j; + int packet_index = i * packs_size.x + j; + mips[0][y * buffer_size.x + x] = camera_rays[packet_index].ray.tfar[k]; + } + } + } + } +} + +//////////////////////////////////////////////////////// + +bool RaycastOcclusionCull::is_occluder(RID p_rid) { + return occluder_owner.owns(p_rid); +} + +RID RaycastOcclusionCull::occluder_allocate() { + return occluder_owner.allocate_rid(); +} + +void RaycastOcclusionCull::occluder_initialize(RID p_occluder) { + Occluder *occluder = memnew(Occluder); + occluder_owner.initialize_rid(p_occluder, occluder); +} + +void RaycastOcclusionCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { + Occluder *occluder = occluder_owner.getornull(p_occluder); + ERR_FAIL_COND(!occluder); + + occluder->vertices = p_vertices; + occluder->indices = p_indices; + + for (Set::Element *E = occluder->users.front(); E; E = E->next()) { + RID scenario_rid = E->get().scenario; + RID instance_rid = E->get().instance; + ERR_CONTINUE(!scenarios.has(scenario_rid)); + Scenario &scenario = scenarios[scenario_rid]; + ERR_CONTINUE(!scenario.instances.has(instance_rid)); + + if (!scenario.dirty_instances.has(instance_rid)) { + scenario.dirty_instances.insert(instance_rid); + scenario.dirty_instances_array.push_back(instance_rid); + } + } +} + +void RaycastOcclusionCull::free_occluder(RID p_occluder) { + Occluder *occluder = occluder_owner.getornull(p_occluder); + ERR_FAIL_COND(!occluder); + memdelete(occluder); + 
occluder_owner.free(p_occluder); +} + +//////////////////////////////////////////////////////// + +void RaycastOcclusionCull::add_scenario(RID p_scenario) { + if (scenarios.has(p_scenario)) { + scenarios[p_scenario].removed = false; + } else { + scenarios[p_scenario] = Scenario(); + } +} + +void RaycastOcclusionCull::remove_scenario(RID p_scenario) { + ERR_FAIL_COND(!scenarios.has(p_scenario)); + Scenario &scenario = scenarios[p_scenario]; + scenario.removed = true; +} + +void RaycastOcclusionCull::scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { + ERR_FAIL_COND(!scenarios.has(p_scenario)); + Scenario &scenario = scenarios[p_scenario]; + + if (!scenario.instances.has(p_instance)) { + scenario.instances[p_instance] = OccluderInstance(); + } + + OccluderInstance &instance = scenario.instances[p_instance]; + + if (instance.removed) { + instance.removed = false; + scenario.removed_instances.erase(p_instance); + } + + bool changed = false; + + if (instance.occluder != p_occluder) { + Occluder *old_occluder = occluder_owner.getornull(instance.occluder); + if (old_occluder) { + old_occluder->users.erase(InstanceID(p_scenario, p_instance)); + } + + instance.occluder = p_occluder; + + if (p_occluder.is_valid()) { + Occluder *occluder = occluder_owner.getornull(p_occluder); + ERR_FAIL_COND(!occluder); + occluder->users.insert(InstanceID(p_scenario, p_instance)); + } + changed = true; + } + + if (instance.xform != p_xform) { + scenario.instances[p_instance].xform = p_xform; + changed = true; + } + + if (instance.enabled != p_enabled) { + instance.enabled = p_enabled; + scenario.dirty = true; // The scenario needs a scene re-build, but the instance doesn't need update + } + + if (changed && !scenario.dirty_instances.has(p_instance)) { + scenario.dirty_instances.insert(p_instance); + scenario.dirty_instances_array.push_back(p_instance); + scenario.dirty = true; + } +} + +void 
RaycastOcclusionCull::scenario_remove_instance(RID p_scenario, RID p_instance) { + ERR_FAIL_COND(!scenarios.has(p_scenario)); + Scenario &scenario = scenarios[p_scenario]; + + if (scenario.instances.has(p_instance)) { + OccluderInstance &instance = scenario.instances[p_instance]; + + if (!instance.removed) { + Occluder *occluder = occluder_owner.getornull(instance.occluder); + if (occluder) { + occluder->users.erase(InstanceID(p_scenario, p_instance)); + } + + scenario.removed_instances.push_back(p_instance); + instance.removed = true; + } + } +} + +void RaycastOcclusionCull::Scenario::_update_dirty_instance_thread(int p_idx, RID *p_instances) { + _update_dirty_instance(p_idx, p_instances, nullptr); +} + +void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool) { + OccluderInstance *occ_inst = instances.getptr(p_instances[p_idx]); + + if (!occ_inst) { + return; + } + + Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder); + + if (!occ) { + return; + } + + int vertices_size = occ->vertices.size(); + + // Embree requires the last element to be readable by a 16-byte SSE load instruction, so we add padding to be safe. 
+ occ_inst->xformed_vertices.resize(vertices_size + 1); + + const Vector3 *read_ptr = occ->vertices.ptr(); + Vector3 *write_ptr = occ_inst->xformed_vertices.ptr(); + + if (p_thread_pool && vertices_size > 1024) { + TransformThreadData td; + td.xform = occ_inst->xform; + td.read = read_ptr; + td.write = write_ptr; + td.vertex_count = vertices_size; + td.thread_count = p_thread_pool->get_thread_count(); + p_thread_pool->do_work(td.thread_count, this, &Scenario::_transform_vertices_thread, &td); + } else { + _transform_vertices_range(read_ptr, write_ptr, occ_inst->xform, 0, vertices_size); + } + + occ_inst->indices.resize(occ->indices.size()); + copymem(occ_inst->indices.ptr(), occ->indices.ptr(), occ->indices.size() * sizeof(int32_t)); +} + +void RaycastOcclusionCull::Scenario::_transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data) { + uint32_t vertex_total = p_data->vertex_count; + uint32_t total_threads = p_data->thread_count; + uint32_t from = p_thread * vertex_total / total_threads; + uint32_t to = (p_thread + 1 == total_threads) ? 
vertex_total : ((p_thread + 1) * vertex_total / total_threads); + _transform_vertices_range(p_data->read, p_data->write, p_data->xform, from, to); +} + +void RaycastOcclusionCull::Scenario::_transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to) { + for (int i = p_from; i < p_to; i++) { + p_write[i] = p_xform.xform(p_read[i]); + } +} + +void RaycastOcclusionCull::Scenario::_commit_scene(void *p_ud) { + Scenario *scenario = (Scenario *)p_ud; + int commit_idx = 1 - (scenario->current_scene_idx); + rtcCommitScene(scenario->ebr_scene[commit_idx]); + scenario->commit_done = true; +} + +bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) { + ERR_FAIL_COND_V(singleton == nullptr, false); + + if (commit_thread == nullptr) { + commit_thread = memnew(Thread); + } + + if (commit_thread->is_started()) { + if (commit_done) { + commit_thread->wait_to_finish(); + current_scene_idx = 1 - current_scene_idx; + } else { + return false; + } + } + + if (removed) { + if (ebr_scene[0]) { + rtcReleaseScene(ebr_scene[0]); + } + if (ebr_scene[1]) { + rtcReleaseScene(ebr_scene[1]); + } + return true; + } + + if (!dirty && removed_instances.is_empty() && dirty_instances_array.is_empty()) { + return false; + } + + for (unsigned int i = 0; i < removed_instances.size(); i++) { + instances.erase(removed_instances[i]); + } + + if (dirty_instances_array.size() / p_thread_pool.get_thread_count() > 128) { + // Lots of instances, use per-instance threading + p_thread_pool.do_work(dirty_instances_array.size(), this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr()); + } else { + // Few instances, use threading on the vertex transforms + for (unsigned int i = 0; i < dirty_instances_array.size(); i++) { + _update_dirty_instance(i, dirty_instances_array.ptr(), &p_thread_pool); + } + } + + dirty_instances.clear(); + dirty_instances_array.clear(); + removed_instances.clear(); + + if 
(raycast_singleton->ebr_device == nullptr) { + raycast_singleton->_init_embree(); + } + + int next_scene_idx = 1 - current_scene_idx; + RTCScene &next_scene = ebr_scene[next_scene_idx]; + + if (next_scene) { + rtcReleaseScene(next_scene); + } + + next_scene = rtcNewScene(raycast_singleton->ebr_device); + rtcSetSceneBuildQuality(next_scene, RTCBuildQuality(raycast_singleton->build_quality)); + + const RID *inst_rid = nullptr; + while ((inst_rid = instances.next(inst_rid))) { + OccluderInstance *occ_inst = instances.getptr(*inst_rid); + Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder); + + if (!occ || !occ_inst->enabled) { + continue; + } + + RTCGeometry geom = rtcNewGeometry(raycast_singleton->ebr_device, RTC_GEOMETRY_TYPE_TRIANGLE); + rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, occ_inst->xformed_vertices.ptr(), 0, sizeof(Vector3), occ_inst->xformed_vertices.size()); + rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, occ_inst->indices.ptr(), 0, sizeof(uint32_t) * 3, occ_inst->indices.size() / 3); + rtcCommitGeometry(geom); + rtcAttachGeometry(next_scene, geom); + rtcReleaseGeometry(geom); + } + + dirty = false; + commit_done = false; + commit_thread->start(&Scenario::_commit_scene, this); + return false; +} + +void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThreadData *p_raycast_data) const { + RTCIntersectContext ctx; + rtcInitIntersectContext(&ctx); + ctx.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT; + + rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]); +} + +void RaycastOcclusionCull::Scenario::raycast(LocalVector &r_rays, const LocalVector p_valid_masks, ThreadWorkPool &p_thread_pool) const { + ERR_FAIL_COND(singleton == nullptr); + if (raycast_singleton->ebr_device == nullptr) { + return; // Embree is initialized on demand when there is some scenario with 
occluders in it. + } + + if (ebr_scene[current_scene_idx] == nullptr) { + return; + } + + RaycastThreadData td; + td.rays = r_rays.ptr(); + td.masks = p_valid_masks.ptr(); + + p_thread_pool.do_work(r_rays.size(), this, &Scenario::_raycast, &td); +} + +//////////////////////////////////////////////////////// + +void RaycastOcclusionCull::add_buffer(RID p_buffer) { + ERR_FAIL_COND(buffers.has(p_buffer)); + buffers[p_buffer] = RaycastHZBuffer(); +} + +void RaycastOcclusionCull::remove_buffer(RID p_buffer) { + ERR_FAIL_COND(!buffers.has(p_buffer)); + buffers.erase(p_buffer); +} + +void RaycastOcclusionCull::buffer_set_scenario(RID p_buffer, RID p_scenario) { + ERR_FAIL_COND(!buffers.has(p_buffer)); + ERR_FAIL_COND(p_scenario.is_valid() && !scenarios.has(p_scenario)); + buffers[p_buffer].scenario_rid = p_scenario; +} + +void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size) { + ERR_FAIL_COND(!buffers.has(p_buffer)); + buffers[p_buffer].resize(p_size); +} + +void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) { + if (!buffers.has(p_buffer)) { + return; + } + + RaycastHZBuffer &buffer = buffers[p_buffer]; + + if (buffer.is_empty() || !scenarios.has(buffer.scenario_rid)) { + return; + } + + Scenario &scenario = scenarios[buffer.scenario_rid]; + + bool removed = scenario.update(p_thread_pool); + + if (removed) { + scenarios.erase(buffer.scenario_rid); + return; + } + + buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool); + + scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks, p_thread_pool); + buffer.sort_rays(); + buffer.update_mips(); +} + +RaycastOcclusionCull::HZBuffer *RaycastOcclusionCull::buffer_get_ptr(RID p_buffer) { + if (!buffers.has(p_buffer)) { + return nullptr; + } + return &buffers[p_buffer]; +} + +RID RaycastOcclusionCull::buffer_get_debug_texture(RID 
p_buffer) { + ERR_FAIL_COND_V(!buffers.has(p_buffer), RID()); + return buffers[p_buffer].get_debug_texture(); +} + +//////////////////////////////////////////////////////// + +void RaycastOcclusionCull::set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) { + if (build_quality == p_quality) { + return; + } + + build_quality = p_quality; + + const RID *scenario_rid = nullptr; + while ((scenario_rid = scenarios.next(scenario_rid))) { + scenarios[*scenario_rid].dirty = true; + } +} + +void RaycastOcclusionCull::_init_embree() { +#ifdef __SSE2__ + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif + + String settings = vformat("threads=%d", MAX(1, OS::get_singleton()->get_processor_count() - 2)); + ebr_device = rtcNewDevice(settings.utf8().ptr()); +} + +RaycastOcclusionCull::RaycastOcclusionCull() { + raycast_singleton = this; + int default_quality = GLOBAL_GET("rendering/occlusion_culling/bvh_build_quality"); + build_quality = RS::ViewportOcclusionCullingBuildQuality(default_quality); +} + +RaycastOcclusionCull::~RaycastOcclusionCull() { + const RID *scenario_rid = nullptr; + while ((scenario_rid = scenarios.next(scenario_rid))) { + Scenario &scenario = scenarios[*scenario_rid]; + if (scenario.commit_thread) { + scenario.commit_thread->wait_to_finish(); + memdelete(scenario.commit_thread); + } + } + + if (ebr_device != nullptr) { +#ifdef __SSE2__ + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); +#endif + rtcReleaseDevice(ebr_device); + } + + raycast_singleton = nullptr; +} diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h new file mode 100644 index 00000000000..acaceb9459f --- /dev/null +++ b/modules/raycast/raycast_occlusion_cull.h @@ -0,0 +1,184 @@ +/*************************************************************************/ +/* raycast_occlusion_cull.h */ 
+/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifndef OCCLUSION_CULL_RAYCASTER_H +#define OCCLUSION_CULL_RAYCASTER_H + +#include "core/io/image.h" +#include "core/math/camera_matrix.h" +#include "core/object/object.h" +#include "core/object/reference.h" +#include "core/templates/local_vector.h" +#include "core/templates/rid_owner.h" +#include "scene/resources/mesh.h" +#include "servers/rendering/renderer_scene_occlusion_cull.h" + +#include + +class RaycastOcclusionCull : public RendererSceneOcclusionCull { + typedef RTCRayHit16 RayPacket; + +public: + class RaycastHZBuffer : public HZBuffer { + private: + Size2i packs_size; + + struct CameraRayThreadData { + CameraMatrix camera_matrix; + Transform camera_transform; + bool camera_orthogonal; + int thread_count; + Size2i buffer_size; + }; + + void _camera_rays_threaded(uint32_t p_thread, CameraRayThreadData *p_data); + void _generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to); + + public: + LocalVector camera_rays; + LocalVector camera_ray_masks; + RID scenario_rid; + + virtual void clear() override; + virtual void resize(const Size2i &p_size) override; + void sort_rays(); + void update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool); + }; + +private: + struct InstanceID { + RID scenario; + RID instance; + + bool operator<(const InstanceID &rhs) const { + if (instance == rhs.instance) { + return rhs.scenario < scenario; + } + return instance < rhs.instance; + } + + InstanceID() {} + InstanceID(RID s, RID i) : + scenario(s), instance(i) {} + }; + + struct Occluder { + PackedVector3Array vertices; + PackedInt32Array indices; + Set users; + }; + + struct OccluderInstance { + RID occluder; + LocalVector indices; + LocalVector xformed_vertices; + Transform xform; + bool enabled = true; + bool removed = 
false; + }; + + struct Scenario { + struct RaycastThreadData { + RayPacket *rays; + const uint32_t *masks; + }; + + struct TransformThreadData { + uint32_t thread_count; + uint32_t vertex_count; + Transform xform; + const Vector3 *read; + Vector3 *write; + }; + + Thread *commit_thread = nullptr; + bool commit_done = true; + bool dirty = false; + bool removed = false; + + RTCScene ebr_scene[2] = { nullptr, nullptr }; + int current_scene_idx = 0; + + HashMap instances; + Set dirty_instances; // To avoid duplicates + LocalVector dirty_instances_array; // To iterate and split into threads + LocalVector removed_instances; + + void _update_dirty_instance_thread(int p_idx, RID *p_instances); + void _update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool); + void _transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data); + void _transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to); + static void _commit_scene(void *p_ud); + bool update(ThreadWorkPool &p_thread_pool); + + void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const; + void raycast(LocalVector &r_rays, const LocalVector p_valid_masks, ThreadWorkPool &p_thread_pool) const; + }; + + static RaycastOcclusionCull *raycast_singleton; + + static const int TILE_SIZE = 4; + static const int TILE_RAYS = TILE_SIZE * TILE_SIZE; + + RTCDevice ebr_device = nullptr; + RID_PtrOwner occluder_owner; + HashMap scenarios; + HashMap buffers; + RS::ViewportOcclusionCullingBuildQuality build_quality; + + void _init_embree(); + +public: + virtual bool is_occluder(RID p_rid) override; + virtual RID occluder_allocate() override; + virtual void occluder_initialize(RID p_occluder) override; + virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override; + virtual void free_occluder(RID p_occluder) override; + + virtual void add_scenario(RID 
p_scenario) override; + virtual void remove_scenario(RID p_scenario) override; + virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) override; + virtual void scenario_remove_instance(RID p_scenario, RID p_instance) override; + + virtual void add_buffer(RID p_buffer) override; + virtual void remove_buffer(RID p_buffer) override; + virtual HZBuffer *buffer_get_ptr(RID p_buffer) override; + virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) override; + virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) override; + virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) override; + virtual RID buffer_get_debug_texture(RID p_buffer) override; + + virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) override; + + RaycastOcclusionCull(); + ~RaycastOcclusionCull(); +}; + +#endif // OCCLUSION_CULL_RAYCASTER_H diff --git a/modules/raycast/register_types.cpp b/modules/raycast/register_types.cpp new file mode 100644 index 00000000000..78ca91309f6 --- /dev/null +++ b/modules/raycast/register_types.cpp @@ -0,0 +1,49 @@ +/*************************************************************************/ +/* register_types.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#include "register_types.h" + +#include "lightmap_raycaster.h" +#include "raycast_occlusion_cull.h" + +RaycastOcclusionCull *raycast_occlusion_cull = nullptr; + +void register_raycast_types() { +#ifdef TOOLS_ENABLED + LightmapRaycasterEmbree::make_default_raycaster(); +#endif + raycast_occlusion_cull = memnew(RaycastOcclusionCull); +} + +void unregister_raycast_types() { + if (raycast_occlusion_cull) { + memdelete(raycast_occlusion_cull); + } +} diff --git a/modules/raycast/register_types.h b/modules/raycast/register_types.h new file mode 100644 index 00000000000..789604a4917 --- /dev/null +++ b/modules/raycast/register_types.h @@ -0,0 +1,32 @@ +/*************************************************************************/ +/* register_types.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +void register_raycast_types(); +void unregister_raycast_types(); diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp index 2e1b77dfe59..ef648a126e7 100644 --- a/scene/3d/baked_lightmap.cpp +++ b/scene/3d/baked_lightmap.cpp @@ -619,10 +619,6 @@ void BakedLightmap::_gen_new_positions_from_octree(const GenProbesOctree *p_cell } BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, String p_image_data_path, Lightmapper::BakeStepFunc p_bake_step, void *p_bake_userdata) { - if (p_image_data_path == "" && (get_light_data().is_null() || !get_light_data()->get_path().is_resource_file())) { - return BAKE_ERROR_NO_SAVE_PATH; - } - if (p_image_data_path == "") { if (get_light_data().is_null()) { return BAKE_ERROR_NO_SAVE_PATH; diff --git a/scene/3d/lightmapper.cpp b/scene/3d/lightmapper.cpp index c17ac52aa27..9e5078ba95e 100644 --- a/scene/3d/lightmapper.cpp +++ b/scene/3d/lightmapper.cpp @@ -39,6 +39,15 @@ Ref LightmapDenoiser::create() { return Ref(); } +LightmapRaycaster *(*LightmapRaycaster::create_function)() = nullptr; + +Ref LightmapRaycaster::create() { + if (create_function) { + return Ref(create_function()); + } + return Ref(); +} + Lightmapper::CreateFunc Lightmapper::create_custom = nullptr; Lightmapper::CreateFunc Lightmapper::create_gpu = nullptr; Lightmapper::CreateFunc Lightmapper::create_cpu = nullptr; diff --git a/scene/3d/lightmapper.h 
b/scene/3d/lightmapper.h index a07a964c01b..f63515f6664 100644 --- a/scene/3d/lightmapper.h +++ b/scene/3d/lightmapper.h @@ -34,6 +34,16 @@ #include "scene/resources/mesh.h" #include "servers/rendering/rendering_device.h" +#if !defined(__aligned) + +#if defined(_WIN32) && defined(_MSC_VER) +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#else +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif + +#endif + class LightmapDenoiser : public Reference { GDCLASS(LightmapDenoiser, Reference) protected: @@ -44,6 +54,73 @@ public: static Ref create(); }; +class LightmapRaycaster : public Reference { + GDCLASS(LightmapRaycaster, Reference) +protected: + static LightmapRaycaster *(*create_function)(); + +public: + // compatible with embree3 rays + struct __aligned(16) Ray { + const static unsigned int INVALID_GEOMETRY_ID = ((unsigned int)-1); // from rtcore_common.h + + /*! Default construction does nothing. */ + _FORCE_INLINE_ Ray() : + geomID(INVALID_GEOMETRY_ID) {} + + /*! Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far. */ + _FORCE_INLINE_ Ray(const Vector3 &org, + const Vector3 &dir, + float tnear = 0.0f, + float tfar = INFINITY) : + org(org), + tnear(tnear), + dir(dir), + time(0.0f), + tfar(tfar), + mask(-1), + u(0.0), + v(0.0), + primID(INVALID_GEOMETRY_ID), + geomID(INVALID_GEOMETRY_ID), + instID(INVALID_GEOMETRY_ID) {} + + /*! Tests if we hit something. */ + _FORCE_INLINE_ explicit operator bool() const { return geomID != INVALID_GEOMETRY_ID; } + + public: + Vector3 org; //!< Ray origin + tnear + float tnear; //!< Start of ray segment + Vector3 dir; //!< Ray direction + tfar + float time; //!< Time of this ray for motion blur. 
+ float tfar; //!< End of ray segment + unsigned int mask; //!< used to mask out objects during traversal + unsigned int id; //!< ray ID + unsigned int flags; //!< ray flags + + Vector3 normal; //!< Not normalized geometry normal + float u; //!< Barycentric u coordinate of hit + float v; //!< Barycentric v coordinate of hit + unsigned int primID; //!< primitive ID + unsigned int geomID; //!< geometry ID + unsigned int instID; //!< instance ID + }; + + virtual bool intersect(Ray &p_ray) = 0; + + virtual void intersect(Vector &r_rays) = 0; + + virtual void add_mesh(const Vector &p_vertices, const Vector &p_normals, const Vector &p_uv2s, unsigned int p_id) = 0; + virtual void set_mesh_alpha_texture(Ref p_alpha_texture, unsigned int p_id) = 0; + virtual void commit() = 0; + + virtual void set_mesh_filter(const Set &p_mesh_ids) = 0; + virtual void clear_mesh_filter() = 0; + + static Ref create(); +}; + class Lightmapper : public Reference { GDCLASS(Lightmapper, Reference) public: diff --git a/scene/3d/mesh_instance_3d.cpp b/scene/3d/mesh_instance_3d.cpp index 99fa8f1f3e7..27d5487a1a8 100644 --- a/scene/3d/mesh_instance_3d.cpp +++ b/scene/3d/mesh_instance_3d.cpp @@ -356,6 +356,7 @@ Ref MeshInstance3D::get_active_material(int p_surface) const { void MeshInstance3D::_mesh_changed() { ERR_FAIL_COND(mesh.is_null()); surface_override_materials.resize(mesh->get_surface_count()); + update_gizmo(); } void MeshInstance3D::create_debug_tangents() { diff --git a/scene/3d/occluder_instance_3d.cpp b/scene/3d/occluder_instance_3d.cpp new file mode 100644 index 00000000000..d3a256db34c --- /dev/null +++ b/scene/3d/occluder_instance_3d.cpp @@ -0,0 +1,335 @@ +/*************************************************************************/ +/* occluder_instance_3d.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ 
+/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#include "occluder_instance_3d.h" +#include "core/core_string_names.h" +#include "scene/3d/mesh_instance_3d.h" + +RID Occluder3D::get_rid() const { + if (!occluder.is_valid()) { + occluder = RS::get_singleton()->occluder_create(); + RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices); + } + return occluder; +} + +void Occluder3D::set_vertices(PackedVector3Array p_vertices) { + vertices = p_vertices; + if (occluder.is_valid()) { + RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices); + } + _update_changes(); +} + +PackedVector3Array Occluder3D::get_vertices() const { + return vertices; +} + +void Occluder3D::set_indices(PackedInt32Array p_indices) { + indices = p_indices; + if (occluder.is_valid()) { + RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices); + } + _update_changes(); +} + +PackedInt32Array Occluder3D::get_indices() const { + return indices; +} + +void Occluder3D::_update_changes() { + aabb = AABB(); + + const Vector3 *ptr = vertices.ptr(); + for (int i = 0; i < vertices.size(); i++) { + aabb.expand_to(ptr[i]); + } + + debug_lines.clear(); + debug_mesh.unref(); + + emit_changed(); +} + +Vector Occluder3D::get_debug_lines() const { + if (!debug_lines.is_empty()) { + return debug_lines; + } + + if (indices.size() % 3 != 0) { + return Vector(); + } + + for (int i = 0; i < indices.size() / 3; i++) { + for (int j = 0; j < 3; j++) { + int a = indices[i * 3 + j]; + int b = indices[i * 3 + (j + 1) % 3]; + ERR_FAIL_INDEX_V_MSG(a, vertices.size(), Vector(), "Occluder indices are out of range."); + ERR_FAIL_INDEX_V_MSG(b, vertices.size(), Vector(), "Occluder indices are out of range."); + debug_lines.push_back(vertices[a]); + debug_lines.push_back(vertices[b]); + } + } + return debug_lines; +} + +Ref Occluder3D::get_debug_mesh() const { + if (debug_mesh.is_valid()) { + return debug_mesh; + } + + if (indices.size() % 3 != 0) { + return 
debug_mesh; + } + + Array arrays; + arrays.resize(Mesh::ARRAY_MAX); + arrays[Mesh::ARRAY_VERTEX] = vertices; + arrays[Mesh::ARRAY_INDEX] = indices; + + debug_mesh.instance(); + debug_mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arrays); + return debug_mesh; +} + +AABB Occluder3D::get_aabb() const { + return aabb; +} + +void Occluder3D::_bind_methods() { + ClassDB::bind_method(D_METHOD("set_vertices", "vertices"), &Occluder3D::set_vertices); + ClassDB::bind_method(D_METHOD("get_vertices"), &Occluder3D::get_vertices); + + ClassDB::bind_method(D_METHOD("set_indices", "indices"), &Occluder3D::set_indices); + ClassDB::bind_method(D_METHOD("get_indices"), &Occluder3D::get_indices); + + ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR3_ARRAY, "vertices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_vertices", "get_vertices"); + ADD_PROPERTY(PropertyInfo(Variant::PACKED_INT32_ARRAY, "indices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_indices", "get_indices"); +} + +Occluder3D::Occluder3D() { +} + +Occluder3D::~Occluder3D() { + if (occluder.is_valid()) { + RS::get_singleton()->free(occluder); + } +} +///////////////////////////////////////////////// + +AABB OccluderInstance3D::get_aabb() const { + if (occluder.is_valid()) { + return occluder->get_aabb(); + } + return AABB(); +} + +Vector OccluderInstance3D::get_faces(uint32_t p_usage_flags) const { + return Vector(); +} + +void OccluderInstance3D::set_occluder(const Ref &p_occluder) { + if (occluder == p_occluder) { + return; + } + + if (occluder.is_valid()) { + occluder->disconnect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed)); + } + + occluder = p_occluder; + + if (occluder.is_valid()) { + set_base(occluder->get_rid()); + occluder->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed)); + } else { + set_base(RID()); + } + + update_gizmo(); +} + +void 
OccluderInstance3D::_occluder_changed() { + update_gizmo(); +} + +Ref OccluderInstance3D::get_occluder() const { + return occluder; +} + +void OccluderInstance3D::set_bake_mask(uint32_t p_mask) { + bake_mask = p_mask; +} + +uint32_t OccluderInstance3D::get_bake_mask() const { + return bake_mask; +} + +void OccluderInstance3D::set_bake_mask_bit(int p_layer, bool p_enable) { + ERR_FAIL_INDEX(p_layer, 32); + if (p_enable) { + set_bake_mask(bake_mask | (1 << p_layer)); + } else { + set_bake_mask(bake_mask & (~(1 << p_layer))); + } +} + +bool OccluderInstance3D::get_bake_mask_bit(int p_layer) const { + ERR_FAIL_INDEX_V(p_layer, 32, false); + return (bake_mask & (1 << p_layer)); +} + +bool OccluderInstance3D::_bake_material_check(Ref p_material) { + StandardMaterial3D *standard_mat = Object::cast_to(p_material.ptr()); + if (standard_mat && standard_mat->get_transparency() != StandardMaterial3D::TRANSPARENCY_DISABLED) { + return false; + } + return true; +} + +void OccluderInstance3D::_bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices) { + MeshInstance3D *mi = Object::cast_to(p_node); + if (mi && mi->is_visible_in_tree()) { + Ref mesh = mi->get_mesh(); + bool valid = true; + + if (mesh.is_null()) { + valid = false; + } + + if (valid && !_bake_material_check(mi->get_material_override())) { + valid = false; + } + + if ((mi->get_layer_mask() & bake_mask) == 0) { + valid = false; + } + + if (valid) { + Transform global_to_local = get_global_transform().affine_inverse() * mi->get_global_transform(); + + for (int i = 0; i < mesh->get_surface_count(); i++) { + if (mesh->surface_get_primitive_type(i) != Mesh::PRIMITIVE_TRIANGLES) { + continue; + } + + if (mi->get_surface_override_material(i).is_valid()) { + if (!_bake_material_check(mi->get_surface_override_material(i))) { + continue; + } + } else { + if (!_bake_material_check(mesh->surface_get_material(i))) { + continue; + } + } + + Array arrays = mesh->surface_get_arrays(i); + + int 
vertex_offset = r_vertices.size(); + PackedVector3Array vertices = arrays[Mesh::ARRAY_VERTEX]; + r_vertices.resize(r_vertices.size() + vertices.size()); + + Vector3 *vtx_ptr = r_vertices.ptrw(); + for (int j = 0; j < vertices.size(); j++) { + vtx_ptr[vertex_offset + j] = global_to_local.xform(vertices[j]); + } + + int index_offset = r_indices.size(); + PackedInt32Array indices = arrays[Mesh::ARRAY_INDEX]; + r_indices.resize(r_indices.size() + indices.size()); + + int *idx_ptr = r_indices.ptrw(); + for (int j = 0; j < indices.size(); j++) { + idx_ptr[index_offset + j] = vertex_offset + indices[j]; + } + } + } + } + + for (int i = 0; i < p_node->get_child_count(); i++) { + Node *child = p_node->get_child(i); + if (!child->get_owner()) { + continue; //maybe a helper + } + + _bake_node(child, r_vertices, r_indices); + } +} + +OccluderInstance3D::BakeError OccluderInstance3D::bake(Node *p_from_node, String p_occluder_path) { + if (p_occluder_path == "") { + if (get_occluder().is_null()) { + return BAKE_ERROR_NO_SAVE_PATH; + } + } + + PackedVector3Array vertices; + PackedInt32Array indices; + + _bake_node(p_from_node, vertices, indices); + + if (vertices.is_empty() || indices.is_empty()) { + return BAKE_ERROR_NO_MESHES; + } + + Ref occ; + if (get_occluder().is_valid()) { + occ = get_occluder(); + } else { + occ.instance(); + occ->set_path(p_occluder_path); + } + + occ->set_vertices(vertices); + occ->set_indices(indices); + set_occluder(occ); + + return BAKE_ERROR_OK; +} + +void OccluderInstance3D::_bind_methods() { + ClassDB::bind_method(D_METHOD("set_bake_mask", "mask"), &OccluderInstance3D::set_bake_mask); + ClassDB::bind_method(D_METHOD("get_bake_mask"), &OccluderInstance3D::get_bake_mask); + ClassDB::bind_method(D_METHOD("set_bake_mask_bit", "layer", "enabled"), &OccluderInstance3D::set_bake_mask_bit); + ClassDB::bind_method(D_METHOD("get_bake_mask_bit", "layer"), &OccluderInstance3D::get_bake_mask_bit); + + ClassDB::bind_method(D_METHOD("set_occluder", "occluder"), 
&OccluderInstance3D::set_occluder); + ClassDB::bind_method(D_METHOD("get_occluder"), &OccluderInstance3D::get_occluder); + + ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "occluder", PROPERTY_HINT_RESOURCE_TYPE, "Occluder3D"), "set_occluder", "get_occluder"); + ADD_GROUP("Bake", "bake_"); + ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_mask", PROPERTY_HINT_LAYERS_3D_RENDER), "set_bake_mask", "get_bake_mask"); +} + +OccluderInstance3D::OccluderInstance3D() { +} + +OccluderInstance3D::~OccluderInstance3D() { +} diff --git a/scene/3d/occluder_instance_3d.h b/scene/3d/occluder_instance_3d.h new file mode 100644 index 00000000000..4bb468274d2 --- /dev/null +++ b/scene/3d/occluder_instance_3d.h @@ -0,0 +1,108 @@ +/*************************************************************************/ +/* occluder_instance_3d.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef OCCLUDER_INSTANCE_3D_H +#define OCCLUDER_INSTANCE_3D_H + +#include "scene/3d/visual_instance_3d.h" + +class Occluder3D : public Resource { + GDCLASS(Occluder3D, Resource); + RES_BASE_EXTENSION("occ"); + + mutable RID occluder; + mutable Ref debug_mesh; + mutable Vector debug_lines; + AABB aabb; + + PackedVector3Array vertices; + PackedInt32Array indices; + + void _update_changes(); + +protected: + static void _bind_methods(); + +public: + void set_vertices(PackedVector3Array p_vertices); + PackedVector3Array get_vertices() const; + + void set_indices(PackedInt32Array p_indices); + PackedInt32Array get_indices() const; + + Vector get_debug_lines() const; + Ref get_debug_mesh() const; + AABB get_aabb() const; + + virtual RID get_rid() const override; + Occluder3D(); + ~Occluder3D(); +}; + +class OccluderInstance3D : public VisualInstance3D { + GDCLASS(OccluderInstance3D, Node3D); + +private: + Ref occluder; + uint32_t bake_mask = 0xFFFFFFFF; + + void _occluder_changed(); + + bool _bake_material_check(Ref p_material); + void _bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices); + +protected: + static void _bind_methods(); + +public: + enum BakeError { + BAKE_ERROR_OK, + BAKE_ERROR_NO_SAVE_PATH, + BAKE_ERROR_NO_MESHES, + }; + + void set_occluder(const Ref &p_occluder); + Ref get_occluder() const; + + virtual AABB get_aabb() const override; + 
virtual Vector get_faces(uint32_t p_usage_flags) const override; + + void set_bake_mask(uint32_t p_mask); + uint32_t get_bake_mask() const; + + void set_bake_mask_bit(int p_layer, bool p_enable); + bool get_bake_mask_bit(int p_layer) const; + BakeError bake(Node *p_from_node, String p_occluder_path = ""); + + OccluderInstance3D(); + ~OccluderInstance3D(); +}; + +#endif diff --git a/scene/3d/visual_instance_3d.cpp b/scene/3d/visual_instance_3d.cpp index 394c67e873c..d81b09b86cb 100644 --- a/scene/3d/visual_instance_3d.cpp +++ b/scene/3d/visual_instance_3d.cpp @@ -338,6 +338,15 @@ GeometryInstance3D::GIMode GeometryInstance3D::get_gi_mode() const { return gi_mode; } +void GeometryInstance3D::set_ignore_occlusion_culling(bool p_enabled) { + ignore_occlusion_culling = p_enabled; + RS::get_singleton()->instance_geometry_set_flag(get_instance(), RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, ignore_occlusion_culling); +} + +bool GeometryInstance3D::is_ignoring_occlusion_culling() { + return ignore_occlusion_culling; +} + void GeometryInstance3D::_bind_methods() { ClassDB::bind_method(D_METHOD("set_material_override", "material"), &GeometryInstance3D::set_material_override); ClassDB::bind_method(D_METHOD("get_material_override"), &GeometryInstance3D::get_material_override); @@ -345,21 +354,24 @@ void GeometryInstance3D::_bind_methods() { ClassDB::bind_method(D_METHOD("set_cast_shadows_setting", "shadow_casting_setting"), &GeometryInstance3D::set_cast_shadows_setting); ClassDB::bind_method(D_METHOD("get_cast_shadows_setting"), &GeometryInstance3D::get_cast_shadows_setting); + ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias); + ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias); + ClassDB::bind_method(D_METHOD("set_lod_max_hysteresis", "mode"), &GeometryInstance3D::set_lod_max_hysteresis); ClassDB::bind_method(D_METHOD("get_lod_max_hysteresis"), &GeometryInstance3D::get_lod_max_hysteresis); 
ClassDB::bind_method(D_METHOD("set_lod_max_distance", "mode"), &GeometryInstance3D::set_lod_max_distance); ClassDB::bind_method(D_METHOD("get_lod_max_distance"), &GeometryInstance3D::get_lod_max_distance); - ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform); - ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform); - ClassDB::bind_method(D_METHOD("set_lod_min_hysteresis", "mode"), &GeometryInstance3D::set_lod_min_hysteresis); ClassDB::bind_method(D_METHOD("get_lod_min_hysteresis"), &GeometryInstance3D::get_lod_min_hysteresis); ClassDB::bind_method(D_METHOD("set_lod_min_distance", "mode"), &GeometryInstance3D::set_lod_min_distance); ClassDB::bind_method(D_METHOD("get_lod_min_distance"), &GeometryInstance3D::get_lod_min_distance); + ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform); + ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform); + ClassDB::bind_method(D_METHOD("set_extra_cull_margin", "margin"), &GeometryInstance3D::set_extra_cull_margin); ClassDB::bind_method(D_METHOD("get_extra_cull_margin"), &GeometryInstance3D::get_extra_cull_margin); @@ -369,8 +381,8 @@ void GeometryInstance3D::_bind_methods() { ClassDB::bind_method(D_METHOD("set_gi_mode", "mode"), &GeometryInstance3D::set_gi_mode); ClassDB::bind_method(D_METHOD("get_gi_mode"), &GeometryInstance3D::get_gi_mode); - ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias); - ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias); + ClassDB::bind_method(D_METHOD("set_ignore_occlusion_culling", "ignore_culling"), &GeometryInstance3D::set_ignore_occlusion_culling); + ClassDB::bind_method(D_METHOD("is_ignoring_occlusion_culling"), 
&GeometryInstance3D::is_ignoring_occlusion_culling); ClassDB::bind_method(D_METHOD("set_custom_aabb", "aabb"), &GeometryInstance3D::set_custom_aabb); @@ -381,6 +393,7 @@ void GeometryInstance3D::_bind_methods() { ADD_PROPERTY(PropertyInfo(Variant::INT, "cast_shadow", PROPERTY_HINT_ENUM, "Off,On,Double-Sided,Shadows Only"), "set_cast_shadows_setting", "get_cast_shadows_setting"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "extra_cull_margin", PROPERTY_HINT_RANGE, "0,16384,0.01"), "set_extra_cull_margin", "get_extra_cull_margin"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_bias", PROPERTY_HINT_RANGE, "0.001,128,0.001"), "set_lod_bias", "get_lod_bias"); + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ignore_occlusion_culling"), "set_ignore_occlusion_culling", "is_ignoring_occlusion_culling"); ADD_GROUP("Global Illumination", "gi_"); ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_mode", PROPERTY_HINT_ENUM, "Disabled,Baked,Dynamic"), "set_gi_mode", "get_gi_mode"); ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_lightmap_scale", PROPERTY_HINT_ENUM, "1x,2x,4x,8x"), "set_lightmap_scale", "get_lightmap_scale"); diff --git a/scene/3d/visual_instance_3d.h b/scene/3d/visual_instance_3d.h index 7fed8095ef7..68d29ef81e9 100644 --- a/scene/3d/visual_instance_3d.h +++ b/scene/3d/visual_instance_3d.h @@ -120,6 +120,7 @@ private: float extra_cull_margin = 0.0; LightmapScale lightmap_scale = LIGHTMAP_SCALE_1X; GIMode gi_mode = GI_MODE_DISABLED; + bool ignore_occlusion_culling = false; const StringName *_instance_uniform_get_remap(const StringName p_name) const; @@ -167,6 +168,9 @@ public: void set_custom_aabb(AABB aabb); + void set_ignore_occlusion_culling(bool p_enabled); + bool is_ignoring_occlusion_culling(); + GeometryInstance3D(); }; diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp index a62c4ff770a..387af3703b4 100644 --- a/scene/main/scene_tree.cpp +++ b/scene/main/scene_tree.cpp @@ -1378,6 +1378,9 @@ SceneTree::SceneTree() { const bool use_debanding = 
GLOBAL_DEF("rendering/anti_aliasing/quality/use_debanding", false); root->set_use_debanding(use_debanding); + const bool use_occlusion_culling = GLOBAL_DEF("rendering/occlusion_culling/use_occlusion_culling", false); + root->set_use_occlusion_culling(use_occlusion_culling); + float lod_threshold = GLOBAL_DEF("rendering/mesh_lod/lod_change/threshold_pixels", 1.0); ProjectSettings::get_singleton()->set_custom_property_info("rendering/mesh_lod/lod_change/threshold_pixels", PropertyInfo(Variant::FLOAT, "rendering/mesh_lod/lod_change/threshold_pixels", PROPERTY_HINT_RANGE, "0,1024,0.1")); root->set_lod_threshold(lod_threshold); diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp index 4c9ebe016ec..f1613f2fe5f 100644 --- a/scene/main/viewport.cpp +++ b/scene/main/viewport.cpp @@ -3242,6 +3242,21 @@ float Viewport::get_lod_threshold() const { return lod_threshold; } +void Viewport::set_use_occlusion_culling(bool p_use_occlusion_culling) { + if (use_occlusion_culling == p_use_occlusion_culling) { + return; + } + + use_occlusion_culling = p_use_occlusion_culling; + RS::get_singleton()->viewport_set_use_occlusion_culling(viewport, p_use_occlusion_culling); + + notify_property_list_changed(); +} + +bool Viewport::is_using_occlusion_culling() const { + return use_occlusion_culling; +} + void Viewport::set_debug_draw(DebugDraw p_debug_draw) { debug_draw = p_debug_draw; RS::get_singleton()->viewport_set_debug_draw(viewport, RS::ViewportDebugDraw(p_debug_draw)); @@ -3331,9 +3346,6 @@ bool Viewport::is_handling_input_locally() const { return handle_input_locally; } -void Viewport::_validate_property(PropertyInfo &property) const { -} - void Viewport::set_default_canvas_item_texture_filter(DefaultCanvasItemTextureFilter p_filter) { ERR_FAIL_INDEX(p_filter, DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_MAX); @@ -3478,6 +3490,9 @@ void Viewport::_bind_methods() { ClassDB::bind_method(D_METHOD("set_use_debanding", "enable"), &Viewport::set_use_debanding); 
ClassDB::bind_method(D_METHOD("is_using_debanding"), &Viewport::is_using_debanding); + ClassDB::bind_method(D_METHOD("set_use_occlusion_culling", "enable"), &Viewport::set_use_occlusion_culling); + ClassDB::bind_method(D_METHOD("is_using_occlusion_culling"), &Viewport::is_using_occlusion_culling); + ClassDB::bind_method(D_METHOD("set_debug_draw", "debug_draw"), &Viewport::set_debug_draw); ClassDB::bind_method(D_METHOD("get_debug_draw"), &Viewport::get_debug_draw); @@ -3574,6 +3589,7 @@ void Viewport::_bind_methods() { ADD_PROPERTY(PropertyInfo(Variant::INT, "msaa", PROPERTY_HINT_ENUM, "Disabled,2x,4x,8x,16x,AndroidVR 2x,AndroidVR 4x"), "set_msaa", "get_msaa"); ADD_PROPERTY(PropertyInfo(Variant::INT, "screen_space_aa", PROPERTY_HINT_ENUM, "Disabled,FXAA"), "set_screen_space_aa", "get_screen_space_aa"); ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_debanding"), "set_use_debanding", "is_using_debanding"); + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_occlusion_culling"), "set_use_occlusion_culling", "is_using_occlusion_culling"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_threshold", PROPERTY_HINT_RANGE, "0,1024,0.1"), "set_lod_threshold", "get_lod_threshold"); ADD_PROPERTY(PropertyInfo(Variant::INT, "debug_draw", PROPERTY_HINT_ENUM, "Disabled,Unshaded,Overdraw,Wireframe"), "set_debug_draw", "get_debug_draw"); ADD_GROUP("Canvas Items", "canvas_item_"); @@ -3655,6 +3671,7 @@ void Viewport::_bind_methods() { BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_SPOT_LIGHTS); BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_DECALS); BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_REFLECTION_PROBES); + BIND_ENUM_CONSTANT(DEBUG_DRAW_OCCLUDERS); BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST); BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_LINEAR); diff --git a/scene/main/viewport.h b/scene/main/viewport.h index e8a88debf11..6786b70a6b8 100644 --- a/scene/main/viewport.h +++ b/scene/main/viewport.h @@ -147,6 +147,7 @@ public: DEBUG_DRAW_CLUSTER_SPOT_LIGHTS, DEBUG_DRAW_CLUSTER_DECALS, 
DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, + DEBUG_DRAW_OCCLUDERS, }; enum DefaultCanvasItemTextureFilter { @@ -304,6 +305,7 @@ private: ScreenSpaceAA screen_space_aa = SCREEN_SPACE_AA_DISABLED; bool use_debanding = false; float lod_threshold = 1.0; + bool use_occlusion_culling = false; Ref default_texture; Set viewport_textures; @@ -480,7 +482,6 @@ protected: void _notification(int p_what); void _process_picking(); static void _bind_methods(); - virtual void _validate_property(PropertyInfo &property) const override; public: uint64_t get_processed_events_count() const { return event_count; } @@ -556,6 +557,9 @@ public: void set_lod_threshold(float p_pixels); float get_lod_threshold() const; + void set_use_occlusion_culling(bool p_use_occlusion_culling); + bool is_using_occlusion_culling() const; + Vector2 get_camera_coords(const Vector2 &p_viewport_coords) const; Vector2 get_camera_rect_size() const; diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp index 232ad278ddb..1b3be13039c 100644 --- a/scene/register_scene_types.cpp +++ b/scene/register_scene_types.cpp @@ -208,6 +208,7 @@ #include "scene/3d/navigation_agent_3d.h" #include "scene/3d/navigation_obstacle_3d.h" #include "scene/3d/navigation_region_3d.h" +#include "scene/3d/occluder_instance_3d.h" #include "scene/3d/path_3d.h" #include "scene/3d/physics_body_3d.h" #include "scene/3d/physics_joint_3d.h" @@ -442,6 +443,8 @@ void register_scene_types() { ClassDB::register_class(); ClassDB::register_class(); ClassDB::register_class(); + ClassDB::register_class(); + ClassDB::register_class(); ClassDB::register_class(); ClassDB::register_virtual_class(); ClassDB::register_class(); diff --git a/scene/resources/world_3d.cpp b/scene/resources/world_3d.cpp index f067771d587..e811cbf57ae 100644 --- a/scene/resources/world_3d.cpp +++ b/scene/resources/world_3d.cpp @@ -321,7 +321,7 @@ void World3D::_bind_methods() { ClassDB::bind_method(D_METHOD("get_environment"), &World3D::get_environment); 
ClassDB::bind_method(D_METHOD("set_fallback_environment", "env"), &World3D::set_fallback_environment); ClassDB::bind_method(D_METHOD("get_fallback_environment"), &World3D::get_fallback_environment); - ClassDB::bind_method(D_METHOD("set_camera_effects", "env"), &World3D::set_camera_effects); + ClassDB::bind_method(D_METHOD("set_camera_effects", "effects"), &World3D::set_camera_effects); ClassDB::bind_method(D_METHOD("get_camera_effects"), &World3D::get_camera_effects); ClassDB::bind_method(D_METHOD("get_direct_space_state"), &World3D::get_direct_space_state); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "environment", PROPERTY_HINT_RESOURCE_TYPE, "Environment"), "set_environment", "get_environment"); diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp index a98f67f163e..a742c4cc28a 100644 --- a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp +++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp @@ -1873,7 +1873,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende storage->render_target_disable_clear_request(rb->render_target); } -void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas) { +void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer) { EffectsRD *effects = storage->get_effects(); RenderBuffers *rb = render_buffers_owner.getornull(p_render_buffers); @@ -1932,6 +1932,13 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID RID reflection_texture = rb->reflection_buffer; effects->copy_to_fb_rect(ambient_texture, storage->render_target_get_rd_framebuffer(rb->render_target), Rect2(Vector2(), rtsize), false, false, false, true, reflection_texture); } + + if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_OCCLUDERS) { + if (p_occlusion_buffer.is_valid()) { + Size2 rtsize = 
storage->render_target_get_size(rb->render_target); + effects->copy_to_fb_rect(storage->texture_get_rd_texture(p_occlusion_buffer), storage->render_target_get_rd_framebuffer(rb->render_target), Rect2i(Vector2(), rtsize), true, false); + } + } } void RendererSceneRenderRD::environment_set_adjustment(RID p_env, bool p_enable, float p_brightness, float p_contrast, float p_saturation, bool p_use_1d_color_correction, RID p_color_correction) { @@ -3516,7 +3523,7 @@ void RendererSceneRenderRD::_pre_opaque_render(bool p_use_ssao, bool p_use_gi, R } } -void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const PagedArray &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) { +void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const PagedArray &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData 
*p_sdfgi_update_data) { // getting this here now so we can direct call a bunch of things more easily RenderBuffers *rb = nullptr; if (p_render_buffers.is_valid()) { @@ -3643,7 +3650,7 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform & RENDER_TIMESTAMP("Tonemap"); _render_buffers_post_process_and_tonemap(p_render_buffers, p_environment, p_camera_effects, p_cam_projection); - _render_buffers_debug_draw(p_render_buffers, p_shadow_atlas); + _render_buffers_debug_draw(p_render_buffers, p_shadow_atlas, p_occluder_debug_tex); if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_SDFGI && rb != nullptr && rb->sdfgi != nullptr) { rb->sdfgi->debug_draw(p_cam_projection, p_cam_transform, rb->width, rb->height, rb->render_target, rb->texture); } diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.h b/servers/rendering/renderer_rd/renderer_scene_render_rd.h index 4bf0818206e..8c01b69b910 100644 --- a/servers/rendering/renderer_rd/renderer_scene_render_rd.h +++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.h @@ -441,7 +441,7 @@ private: void _allocate_blur_textures(RenderBuffers *rb); void _allocate_luminance_textures(RenderBuffers *rb); - void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas); + void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer); void _render_buffers_post_process_and_tonemap(RID p_render_buffers, RID p_environment, RID p_camera_effects, const CameraMatrix &p_projection); /* Cluster */ @@ -1125,7 +1125,7 @@ public: float render_buffers_get_volumetric_fog_end(RID p_render_buffers); float render_buffers_get_volumetric_fog_detail_spread(RID p_render_buffers); - void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const 
PagedArray &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr); + void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const PagedArray &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr); void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, RID p_framebuffer, const Rect2i &p_region); diff --git a/servers/rendering/renderer_scene.h b/servers/rendering/renderer_scene.h index 551d4f4240a..db1e3d1377e 100644 --- a/servers/rendering/renderer_scene.h +++ b/servers/rendering/renderer_scene.h @@ -49,6 +49,10 @@ public: virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0; virtual bool is_camera(RID p_camera) const = 0; + virtual RID occluder_allocate() = 0; + virtual void occluder_initialize(RID p_occluder) = 0; + virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0; + virtual RID scenario_allocate() = 0; virtual void scenario_initialize(RID p_rid) = 0; @@ -197,8 
+201,8 @@ public: virtual void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) = 0; virtual void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas) = 0; - virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0; - virtual void render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0; + virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0; + virtual void render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0; virtual void update() = 0; virtual void render_probes() = 0; diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp index c7caf71fcca..5417695ad32 100644 --- a/servers/rendering/renderer_scene_cull.cpp +++ b/servers/rendering/renderer_scene_cull.cpp @@ -109,6 +109,20 @@ bool RendererSceneCull::is_camera(RID p_camera) const { return camera_owner.owns(p_camera); } +/* OCCLUDER API */ + +RID RendererSceneCull::occluder_allocate() { + return RendererSceneOcclusionCull::get_singleton()->occluder_allocate(); +} + +void RendererSceneCull::occluder_initialize(RID p_rid) { + RendererSceneOcclusionCull::get_singleton()->occluder_initialize(p_rid); +} + +void RendererSceneCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { + RendererSceneOcclusionCull::get_singleton()->occluder_set_mesh(p_occluder, p_vertices, p_indices); +} + /* SCENARIO API */ void RendererSceneCull::_instance_pair(Instance *p_A, Instance *p_B) { @@ -310,6 +324,8 @@ void 
RendererSceneCull::scenario_initialize(RID p_rid) { scenario->instance_aabbs.set_page_pool(&instance_aabb_page_pool); scenario->instance_data.set_page_pool(&instance_data_page_pool); + RendererSceneOcclusionCull::get_singleton()->add_scenario(p_rid); + scenario_owner.initialize_rid(p_rid, scenario); } @@ -497,6 +513,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) { scene_render->free(gi_probe->probe_instance); } break; + case RS::INSTANCE_OCCLUDER: { + if (scenario && instance->visible) { + RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance); + } + } break; default: { } } @@ -514,6 +535,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) { if (p_base.is_valid()) { instance->base_type = RSG::storage->get_base_type(p_base); + + if (instance->base_type == RS::INSTANCE_NONE && RendererSceneOcclusionCull::get_singleton()->is_occluder(p_base)) { + instance->base_type = RS::INSTANCE_OCCLUDER; + } + ERR_FAIL_COND(instance->base_type == RS::INSTANCE_NONE); switch (instance->base_type) { @@ -588,6 +614,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) { gi_probe->probe_instance = scene_render->gi_probe_instance_create(p_base); } break; + case RS::INSTANCE_OCCLUDER: { + if (scenario) { + RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, p_base, instance->transform, instance->visible); + } + } break; default: { } } @@ -655,6 +686,11 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) { gi_probe_update_list.remove(&gi_probe->update_element); } } break; + case RS::INSTANCE_OCCLUDER: { + if (instance->visible) { + RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance); + } + } break; default: { } } @@ -684,6 +720,9 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) { 
gi_probe_update_list.add(&gi_probe->update_element); } } break; + case RS::INSTANCE_OCCLUDER: { + RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, instance->base, instance->transform, instance->visible); + } break; default: { } } @@ -801,6 +840,12 @@ void RendererSceneCull::instance_set_visible(RID p_instance, bool p_visible) { InstanceParticlesCollisionData *collision = static_cast(instance->base_data); RSG::storage->particles_collision_instance_set_active(collision->instance, p_visible); } + + if (instance->base_type == RS::INSTANCE_OCCLUDER) { + if (instance->scenario) { + RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(instance->scenario->self, p_instance, instance->base, instance->transform, p_visible); + } + } } inline bool is_geometry_instance(RenderingServer::InstanceType p_type) { @@ -998,6 +1043,18 @@ void RendererSceneCull::instance_geometry_set_flag(RID p_instance, RS::InstanceF } } break; + case RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING: { + instance->ignore_occlusion_culling = p_enabled; + + if (instance->scenario && instance->array_index >= 0) { + InstanceData &idata = instance->scenario->instance_data[instance->array_index]; + if (instance->ignore_occlusion_culling) { + idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING; + } else { + idata.flags &= ~uint32_t(InstanceData::FLAG_IGNORE_OCCLUSION_CULLING); + } + } + } break; default: { } } @@ -1210,6 +1267,10 @@ void RendererSceneCull::_update_instance(Instance *p_instance) { heightfield_particle_colliders_update_list.insert(p_instance); } RSG::storage->particles_collision_instance_set_transform(collision->instance, p_instance->transform); + } else if (p_instance->base_type == RS::INSTANCE_OCCLUDER) { + if (p_instance->scenario) { + RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(p_instance->scenario->self, p_instance->self, p_instance->base, p_instance->transform, p_instance->visible); + } } if 
(p_instance->aabb.has_no_surface()) { @@ -1337,6 +1398,9 @@ void RendererSceneCull::_update_instance(Instance *p_instance) { if (p_instance->mesh_instance.is_valid()) { idata.flags |= InstanceData::FLAG_USES_MESH_INSTANCE; } + if (p_instance->ignore_occlusion_culling) { + idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING; + } p_instance->scenario->instance_data.push_back(idata); p_instance->scenario->instance_aabbs.push_back(InstanceBounds(p_instance->transformed_aabb)); @@ -2119,7 +2183,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons return animated_material_found; } -void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) { +void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) { // render to mono camera #ifndef _3D_DISABLED @@ -2164,11 +2228,14 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_ RID environment = _render_get_environment(p_camera, p_scenario); - _render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold); + RENDER_TIMESTAMP("Update occlusion buffer") + RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera->transform, camera_matrix, ortho, RendererThreadPool::singleton->thread_work_pool); + + _render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_viewport, p_shadow_atlas, RID(), -1, p_screen_lod_threshold); #endif } -void RendererSceneCull::render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, 
float p_screen_lod_threshold, RID p_shadow_atlas) { +void RendererSceneCull::render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) { // render for AR/VR interface #if 0 Camera *camera = camera_owner.getornull(p_camera); @@ -2253,7 +2320,7 @@ void RendererSceneCull::render_camera(RID p_render_buffers, Ref &p_ #endif }; -void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data) { +void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, CullData *cull_data) { uint32_t cull_total = cull_data->scenario->instance_data.size(); uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count(); uint32_t cull_from = p_thread * cull_total / total_threads; @@ -2262,7 +2329,7 @@ void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullDat _frustum_cull(*cull_data, frustum_cull_result_threads[p_thread], cull_from, cull_to); } -void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) { +void RendererSceneCull::_frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) { uint64_t frame_number = RSG::rasterizer->get_frame_number(); float lightmap_probe_update_speed = RSG::storage->lightmap_get_probe_capture_update_speed() * RSG::rasterizer->get_frame_delta_time(); @@ -2271,10 +2338,14 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes RID instance_pair_buffer[MAX_INSTANCE_PAIRS]; + Transform inv_cam_transform = cull_data.cam_transform.inverse(); + float z_near = cull_data.camera_matrix->get_z_near(); + for (uint64_t i = p_from; i < p_to; i++) { bool mesh_visible = false; - if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum)) { + if 
(cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum) && (cull_data.occlusion_buffer == nullptr || cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING || + !cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near))) { InstanceData &idata = cull_data.scenario->instance_data[i]; uint32_t base_type = idata.flags & InstanceData::FLAG_BASE_TYPE_MASK; @@ -2469,7 +2540,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes } } -void RendererSceneCull::_render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) { +void RendererSceneCull::_render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) { // Note, in stereo rendering: // - p_cam_transform will be a transform in the middle of our two eyes // - p_cam_projection is a wider frustrum that encompasses both eyes @@ -2566,7 +2637,7 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam uint64_t cull_from = 0; uint64_t cull_to = scenario->instance_data.size(); - FrustumCullData cull_data; + CullData cull_data; //prepare for eventual thread usage cull_data.cull = &cull; @@ -2575,6 +2646,8 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam 
cull_data.cam_transform = p_cam_transform; cull_data.visible_layers = p_visible_layers; cull_data.render_reflection_probe = render_reflection_probe; + cull_data.occlusion_buffer = RendererSceneOcclusionCull::get_singleton()->buffer_get_ptr(p_viewport); + cull_data.camera_matrix = &p_cam_projection; //#define DEBUG_CULL_TIME #ifdef DEBUG_CULL_TIME uint64_t time_from = OS::get_singleton()->get_ticks_usec(); @@ -2781,8 +2854,13 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam } /* PROCESS GEOMETRY AND DRAW SCENE */ + RID occluders_tex; + if (p_viewport.is_valid()) { + occluders_tex = RSG::viewport->viewport_get_occluder_debug_texture(p_viewport); + } + RENDER_TIMESTAMP("Render Scene "); - scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data); + scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, occluders_tex, p_reflection_probe.is_valid() ? 
RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data); for (uint32_t i = 0; i < max_shadows_used; i++) { render_shadow_data[i].instances.clear(); @@ -2829,7 +2907,7 @@ void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario, environment = scenario->fallback_environment; } RENDER_TIMESTAMP("Render Empty Scene "); - scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray(), PagedArray(), PagedArray(), PagedArray(), PagedArray(), PagedArray(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr); + scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray(), PagedArray(), PagedArray(), PagedArray(), PagedArray(), PagedArray(), RID(), RID(), p_shadow_atlas, RID(), scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr); #endif } @@ -2899,7 +2977,7 @@ bool RendererSceneCull::_render_reflection_probe_step(Instance *p_instance, int } RENDER_TIMESTAMP("Render Reflection Probe, Step " + itos(p_step)); - _render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows); + _render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, RID(), shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows); } else { //do roughness postprocess step until it believes it's done @@ -3473,8 +3551,11 @@ bool RendererSceneCull::free(RID p_rid) { scene_render->free(scenario->reflection_probe_shadow_atlas); scene_render->free(scenario->reflection_atlas); scenario_owner.free(p_rid); + 
RendererSceneOcclusionCull::get_singleton()->remove_scenario(p_rid); memdelete(scenario); + } else if (RendererSceneOcclusionCull::get_singleton()->is_occluder(p_rid)) { + RendererSceneOcclusionCull::get_singleton()->free_occluder(p_rid); } else if (instance_owner.owns(p_rid)) { // delete the instance @@ -3543,6 +3624,8 @@ RendererSceneCull::RendererSceneCull() { indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame"); thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances"); thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)RendererThreadPool::singleton->thread_work_pool.get_thread_count()); //make sure there is at least one thread per CPU + + dummy_occlusion_culling = memnew(RendererSceneOcclusionCull); } RendererSceneCull::~RendererSceneCull() { @@ -3561,4 +3644,8 @@ RendererSceneCull::~RendererSceneCull() { frustum_cull_result_threads[i].reset(); } frustum_cull_result_threads.clear(); + + if (dummy_occlusion_culling) { + memdelete(dummy_occlusion_culling); + } } diff --git a/servers/rendering/renderer_scene_cull.h b/servers/rendering/renderer_scene_cull.h index d7d59665ecb..36eece9dd8e 100644 --- a/servers/rendering/renderer_scene_cull.h +++ b/servers/rendering/renderer_scene_cull.h @@ -45,8 +45,10 @@ #include "core/templates/rid_owner.h" #include "core/templates/self_list.h" #include "servers/rendering/renderer_scene.h" +#include "servers/rendering/renderer_scene_occlusion_cull.h" #include "servers/rendering/renderer_scene_render.h" #include "servers/xr/xr_interface.h" + class RendererSceneCull : public RendererScene { public: RendererSceneRender *scene_render; @@ -109,6 +111,14 @@ public: virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable); virtual bool is_camera(RID p_camera) const; + /* OCCLUDER API */ + + virtual RID occluder_allocate(); + virtual void occluder_initialize(RID p_occluder); + virtual void occluder_set_mesh(RID 
p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices); + + RendererSceneOcclusionCull *dummy_occlusion_culling; + /* SCENARIO API */ struct Instance; @@ -248,6 +258,7 @@ public: FLAG_USES_BAKED_LIGHT = (1 << 16), FLAG_USES_MESH_INSTANCE = (1 << 17), FLAG_REFLECTION_PROBE_DIRTY = (1 << 18), + FLAG_IGNORE_OCCLUSION_CULLING = (1 << 19), }; uint32_t flags = 0; @@ -346,6 +357,8 @@ public: float lod_bias; + bool ignore_occlusion_culling; + Vector materials; RS::ShadowCastingSetting cast_shadows; @@ -470,6 +483,7 @@ public: lightmap = nullptr; lightmap_cull_index = 0; lod_bias = 1.0; + ignore_occlusion_culling = false; scenario = nullptr; @@ -921,24 +935,26 @@ public: Frustum frustum; } cull; - struct FrustumCullData { + struct CullData { Cull *cull; Scenario *scenario; RID shadow_atlas; Transform cam_transform; uint32_t visible_layers; Instance *render_reflection_probe; + const RendererSceneOcclusionCull::HZBuffer *occlusion_buffer; + const CameraMatrix *camera_matrix; }; - void _frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data); - void _frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to); + void _frustum_cull_threaded(uint32_t p_thread, CullData *cull_data); + void _frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to); bool _render_reflection_probe_step(Instance *p_instance, int p_step); - void _render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true); + void _render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID 
p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true); void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas); - void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas); - void render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas); + void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas); + void render_camera(RID p_render_buffers, Ref &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas); void update_dirty_instances(); void render_particle_colliders(); diff --git a/servers/rendering/renderer_scene_occlusion_cull.cpp b/servers/rendering/renderer_scene_occlusion_cull.cpp new file mode 100644 index 00000000000..c491ccbe7a1 --- /dev/null +++ b/servers/rendering/renderer_scene_occlusion_cull.cpp @@ -0,0 +1,192 @@ +/*************************************************************************/ +/* renderer_scene_occlusion_cull.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#include "renderer_scene_occlusion_cull.h" + +RendererSceneOcclusionCull *RendererSceneOcclusionCull::singleton = nullptr; + +const Vector3 RendererSceneOcclusionCull::HZBuffer::corners[8] = { + Vector3(0, 0, 0), + Vector3(0, 0, 1), + Vector3(0, 1, 0), + Vector3(0, 1, 1), + Vector3(1, 0, 0), + Vector3(1, 0, 1), + Vector3(1, 1, 0), + Vector3(1, 1, 1) +}; + +bool RendererSceneOcclusionCull::HZBuffer::is_empty() const { + return sizes.is_empty(); +} + +void RendererSceneOcclusionCull::HZBuffer::clear() { + if (sizes.is_empty()) { + return; // Already cleared + } + + data.clear(); + sizes.clear(); + mips.clear(); + + debug_data.clear(); + if (debug_image.is_valid()) { + debug_image.unref(); + } + RS::get_singleton()->free(debug_texture); +} + +void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) { + if (p_size == Size2i()) { + clear(); + return; + } + + if (!sizes.is_empty() && p_size == sizes[0]) { + return; // Size didn't change + } + + int mip_count = 0; + int data_size = 0; + int w = p_size.x; + int h = p_size.y; + + while (true) { + data_size += h * w; + + w = MAX(1, w >> 1); + h = MAX(1, h >> 1); + + mip_count++; + + if (w == 1U && h == 1U) { + data_size += 1U; + mip_count++; + break; + } + } + + data.resize(data_size); + mips.resize(mip_count); + sizes.resize(mip_count); + + w = p_size.x; + h = p_size.y; + float *ptr = data.ptr(); + + for (int i = 0; i < mip_count; i++) { + sizes[i] = Size2i(w, h); + mips[i] = ptr; + + ptr = &ptr[w * h]; + w = MAX(1, w >> 1); + h = MAX(1, h >> 1); + } + + for (int i = 0; i < data_size; i++) { + data[i] = FLT_MAX; + } + + debug_data.resize(sizes[0].x * sizes[0].y); + if (debug_texture.is_valid()) { + RS::get_singleton()->free(debug_texture); + debug_texture = RID(); + } +} + +void RendererSceneOcclusionCull::HZBuffer::update_mips() { + if (sizes.is_empty()) { + return; + } + + for (uint32_t mip = 1; mip < mips.size(); mip++) { + 
for (int y = 0; y < sizes[mip].y; y++) { + for (int x = 0; x < sizes[mip].x; x++) { + int prev_x = x * 2; + int prev_y = y * 2; + + int prev_w = sizes[mip - 1].width; + int prev_h = sizes[mip - 1].height; + + bool odd_w = (prev_w % 2) != 0; + bool odd_h = (prev_h % 2) != 0; + +#define CHECK_OFFSET(xx, yy) max_depth = MAX(max_depth, mips[mip - 1][MIN(prev_h - 1, prev_y + (yy)) * prev_w + MIN(prev_w - 1, prev_x + (xx))]) + + float max_depth = mips[mip - 1][prev_y * sizes[mip - 1].x + prev_x]; + CHECK_OFFSET(0, 1); + CHECK_OFFSET(1, 0); + CHECK_OFFSET(1, 1); + + if (odd_w) { + CHECK_OFFSET(2, 0); + CHECK_OFFSET(2, 1); + } + + if (odd_h) { + CHECK_OFFSET(0, 2); + CHECK_OFFSET(1, 2); + } + + if (odd_w && odd_h) { + CHECK_OFFSET(2, 2); + } + + mips[mip][y * sizes[mip].x + x] = max_depth; +#undef CHECK_OFFSET + } + } + } +} + +RID RendererSceneOcclusionCull::HZBuffer::get_debug_texture() { + if (sizes.is_empty() || sizes[0] == Size2i()) { + return RID(); + } + + if (debug_image.is_null()) { + debug_image.instance(); + } + + unsigned char *ptrw = debug_data.ptrw(); + for (int i = 0; i < debug_data.size(); i++) { + ptrw[i] = MIN(mips[0][i] / debug_tex_range, 1.0) * 255; + } + + debug_image->create(sizes[0].x, sizes[0].y, false, Image::FORMAT_L8, debug_data); + + if (debug_texture.is_null()) { + debug_texture = RS::get_singleton()->texture_2d_create(debug_image); + } else { + RenderingServer::get_singleton()->texture_2d_update_immediate(debug_texture, debug_image); + } + + return debug_texture; +} diff --git a/servers/rendering/renderer_scene_occlusion_cull.h b/servers/rendering/renderer_scene_occlusion_cull.h new file mode 100644 index 00000000000..390bbaa64bd --- /dev/null +++ b/servers/rendering/renderer_scene_occlusion_cull.h @@ -0,0 +1,201 @@ +/*************************************************************************/ +/* renderer_scene_occlusion_cull.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT 
ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifndef RENDERER_SCENE_OCCLUSION_CULL_H +#define RENDERER_SCENE_OCCLUSION_CULL_H + +#include "core/math/camera_matrix.h" +#include "core/templates/local_vector.h" +#include "servers/rendering_server.h" + +class RendererSceneOcclusionCull { +protected: + static RendererSceneOcclusionCull *singleton; + +public: + class HZBuffer { + protected: + static const Vector3 corners[8]; + + LocalVector data; + LocalVector sizes; + LocalVector mips; + + RID debug_texture; + Ref debug_image; + PackedByteArray debug_data; + float debug_tex_range = 0.0f; + + public: + bool is_empty() const; + virtual void clear(); + virtual void resize(const Size2i &p_size); + + void update_mips(); + + _FORCE_INLINE_ bool is_occluded(const float p_bounds[6], const Vector3 &p_cam_position, const Transform &p_cam_inv_transform, const CameraMatrix &p_cam_projection, float p_near) const { + if (is_empty()) { + return false; + } + + Vector3 closest_point = Vector3(CLAMP(p_cam_position.x, p_bounds[0], p_bounds[3]), CLAMP(p_cam_position.y, p_bounds[1], p_bounds[4]), CLAMP(p_cam_position.z, p_bounds[2], p_bounds[5])); + + if (closest_point == p_cam_position) { + return false; + } + + Vector3 closest_point_view = p_cam_inv_transform.xform(closest_point); + if (closest_point_view.z > -p_near) { + return false; + } + + float min_depth; + if (p_cam_projection.is_orthogonal()) { + min_depth = (-closest_point_view.z) - p_near; + } else { + float r = -p_near / closest_point_view.z; + Vector3 closest_point_proj = Vector3(closest_point_view.x * r, closest_point_view.y * r, -p_near); + min_depth = closest_point_proj.distance_to(closest_point_view); + } + + Vector2 rect_min = Vector2(FLT_MAX, FLT_MAX); + Vector2 rect_max = Vector2(FLT_MIN, FLT_MIN); + + for (int j = 0; j < 8; j++) { + Vector3 c = RendererSceneOcclusionCull::HZBuffer::corners[j]; + Vector3 nc = Vector3(1, 1, 1) - c; + Vector3 corner = Vector3(p_bounds[0] * c.x + 
p_bounds[3] * nc.x, p_bounds[1] * c.y + p_bounds[4] * nc.y, p_bounds[2] * c.z + p_bounds[5] * nc.z); + Vector3 view = p_cam_inv_transform.xform(corner); + + Vector3 projected = p_cam_projection.xform(view); + Vector2 normalized = Vector2(projected.x * 0.5f + 0.5f, projected.y * 0.5f + 0.5f); + rect_min = rect_min.min(normalized); + rect_max = rect_max.max(normalized); + } + + rect_max = rect_max.min(Vector2(1, 1)); + rect_min = rect_min.max(Vector2(0, 0)); + + int mip_count = mips.size(); + + Vector2 screen_diagonal = (rect_max - rect_min) * sizes[0]; + float size = MAX(screen_diagonal.x, screen_diagonal.y); + float l = Math::ceil(Math::log2(size)); + int lod = CLAMP(l, 0, mip_count - 1); + + const int max_samples = 512; + int sample_count = 0; + bool visible = true; + + for (; lod >= 0; lod--) { + int w = sizes[lod].x; + int h = sizes[lod].y; + + int minx = CLAMP(rect_min.x * w - 1, 0, w - 1); + int maxx = CLAMP(rect_max.x * w + 1, 0, w - 1); + + int miny = CLAMP(rect_min.y * h - 1, 0, h - 1); + int maxy = CLAMP(rect_max.y * h + 1, 0, h - 1); + + sample_count += (maxx - minx + 1) * (maxy - miny + 1); + + if (sample_count > max_samples) { + return false; + } + + visible = false; + for (int y = miny; y <= maxy; y++) { + for (int x = minx; x <= maxx; x++) { + float depth = mips[lod][y * w + x]; + if (depth > min_depth) { + visible = true; + break; + } + } + if (visible) { + break; + } + } + + if (!visible) { + return true; + } + } + + return !visible; + } + + RID get_debug_texture(); + + virtual ~HZBuffer(){}; + }; + + static RendererSceneOcclusionCull *get_singleton() { return singleton; } + + void _print_warining() { + WARN_PRINT_ONCE("Occlusion culling is disabled at build time."); + } + + virtual bool is_occluder(RID p_rid) { return false; } + virtual RID occluder_allocate() { return RID(); } + virtual void occluder_initialize(RID p_occluder) {} + virtual void free_occluder(RID p_occluder) { _print_warining(); } + virtual void occluder_set_mesh(RID p_occluder, 
const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { _print_warining(); } + + virtual void add_scenario(RID p_scenario) {} + virtual void remove_scenario(RID p_scenario) {} + virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { _print_warining(); } + virtual void scenario_remove_instance(RID p_scenario, RID p_instance) { _print_warining(); } + + virtual void add_buffer(RID p_buffer) { _print_warining(); } + virtual void remove_buffer(RID p_buffer) { _print_warining(); } + virtual HZBuffer *buffer_get_ptr(RID p_buffer) { + return nullptr; + } + virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) { _print_warining(); } + virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) { _print_warining(); } + virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {} + virtual RID buffer_get_debug_texture(RID p_buffer) { + _print_warining(); + return RID(); + } + + virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {} + + RendererSceneOcclusionCull() { + singleton = this; + }; + + virtual ~RendererSceneOcclusionCull() { + singleton = nullptr; + }; +}; + +#endif //RENDERER_SCENE_OCCLUSION_CULL_H diff --git a/servers/rendering/renderer_scene_render.h b/servers/rendering/renderer_scene_render.h index 9ca9574f6f2..3f28fac5499 100644 --- a/servers/rendering/renderer_scene_render.h +++ b/servers/rendering/renderer_scene_render.h @@ -216,7 +216,7 @@ public: uint32_t positional_light_count; }; - virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const PagedArray &p_lightmaps, RID 
p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0; + virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, const PagedArray &p_lights, const PagedArray &p_reflection_probes, const PagedArray &p_gi_probes, const PagedArray &p_decals, const PagedArray &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0; virtual void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0; virtual void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray &p_instances) = 0; diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp index a5d5033c186..f7be6c6c60e 100644 --- a/servers/rendering/renderer_viewport.cpp +++ b/servers/rendering/renderer_viewport.cpp @@ -79,11 +79,26 @@ void RendererViewport::_draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye) { xr_interface = XRServer::get_singleton()->get_primary_interface(); } + if (p_viewport->use_occlusion_culling) { + if (p_viewport->occlusion_buffer_dirty) { + float aspect = p_viewport->size.aspect(); + int max_size = 
occlusion_rays_per_thread * RendererThreadPool::singleton->thread_work_pool.get_thread_count(); + + int viewport_size = p_viewport->size.width * p_viewport->size.height; + max_size = CLAMP(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2)); // At least one depth pixel for every 16x16 region. At most one depth pixel for every 2x2 region. + + float height = Math::sqrt(max_size / aspect); + Size2i new_size = Size2i(height * aspect, height); + RendererSceneOcclusionCull::get_singleton()->buffer_set_size(p_viewport->self, new_size); + p_viewport->occlusion_buffer_dirty = false; + } + } + float screen_lod_threshold = p_viewport->lod_threshold / float(p_viewport->size.width); if (p_viewport->use_xr && xr_interface.is_valid()) { - RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas); + RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas); } else { - RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas); + RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas); } RENDER_TIMESTAMP("render_buffers_configure(viewport->render_buffers, viewport->render_target, viewport->size.width, viewport->size.height, viewport->msaa, viewport->screen_space_aa, viewport->use_debanding); } } + + viewport->occlusion_buffer_dirty = true; } void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) { @@ -655,6 +672,7 @@ void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) { if (p_active) { ERR_FAIL_COND(active_viewports.find(viewport) != -1); 
//already active + viewport->occlusion_buffer_dirty = true; active_viewports.push_back(viewport); } else { active_viewports.erase(viewport); @@ -739,6 +757,16 @@ RID RendererViewport::viewport_get_texture(RID p_viewport) const { return RSG::storage->render_target_get_texture(viewport->render_target); } +RID RendererViewport::viewport_get_occluder_debug_texture(RID p_viewport) const { + const Viewport *viewport = viewport_owner.getornull(p_viewport); + ERR_FAIL_COND_V(!viewport, RID()); + + if (viewport->use_occlusion_culling && viewport->debug_draw == RenderingServer::VIEWPORT_DEBUG_DRAW_OCCLUDERS) { + return RendererSceneOcclusionCull::get_singleton()->buffer_get_debug_texture(p_viewport); + } + return RID(); +} + void RendererViewport::viewport_set_hide_scenario(RID p_viewport, bool p_hide) { Viewport *viewport = viewport_owner.getornull(p_viewport); ERR_FAIL_COND(!viewport); @@ -772,6 +800,9 @@ void RendererViewport::viewport_set_scenario(RID p_viewport, RID p_scenario) { ERR_FAIL_COND(!viewport); viewport->scenario = p_scenario; + if (viewport->use_occlusion_culling) { + RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, p_scenario); + } } void RendererViewport::viewport_attach_canvas(RID p_viewport, RID p_canvas) { @@ -888,6 +919,41 @@ void RendererViewport::viewport_set_use_debanding(RID p_viewport, bool p_use_deb } } +void RendererViewport::viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) { + Viewport *viewport = viewport_owner.getornull(p_viewport); + ERR_FAIL_COND(!viewport); + + if (viewport->use_occlusion_culling == p_use_occlusion_culling) { + return; + } + viewport->use_occlusion_culling = p_use_occlusion_culling; + + if (viewport->use_occlusion_culling) { + RendererSceneOcclusionCull::get_singleton()->add_buffer(p_viewport); + RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, viewport->scenario); + } else { + 
RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_viewport); + } + + viewport->occlusion_buffer_dirty = true; +} + +void RendererViewport::viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) { + if (occlusion_rays_per_thread == p_rays_per_thread) { + return; + } + + occlusion_rays_per_thread = p_rays_per_thread; + + for (int i = 0; i < active_viewports.size(); i++) { + active_viewports[i]->occlusion_buffer_dirty = true; + } +} + +void RendererViewport::viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) { + RendererSceneOcclusionCull::get_singleton()->set_build_quality(p_quality); +} + void RendererViewport::viewport_set_lod_threshold(RID p_viewport, float p_pixels) { Viewport *viewport = viewport_owner.getornull(p_viewport); ERR_FAIL_COND(!viewport); @@ -985,6 +1051,10 @@ bool RendererViewport::free(RID p_rid) { viewport_set_scenario(p_rid, RID()); active_viewports.erase(viewport); + if (viewport->use_occlusion_culling) { + RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_rid); + } + viewport_owner.free(p_rid); memdelete(viewport); @@ -1026,4 +1096,5 @@ void RendererViewport::call_set_use_vsync(bool p_enable) { } RendererViewport::RendererViewport() { + occlusion_rays_per_thread = GLOBAL_GET("rendering/occlusion_culling/occlusion_rays_per_thread"); } diff --git a/servers/rendering/renderer_viewport.h b/servers/rendering/renderer_viewport.h index f5ed543e8da..5c372e8c9ad 100644 --- a/servers/rendering/renderer_viewport.h +++ b/servers/rendering/renderer_viewport.h @@ -31,9 +31,9 @@ #ifndef VISUALSERVERVIEWPORT_H #define VISUALSERVERVIEWPORT_H +#include "core/templates/local_vector.h" #include "core/templates/rid_owner.h" #include "core/templates/self_list.h" -#include "renderer_compositor.h" #include "servers/rendering_server.h" #include "servers/xr/xr_interface.h" @@ -61,6 +61,9 @@ public: RS::ViewportScreenSpaceAA screen_space_aa; bool use_debanding; + bool use_occlusion_culling; + bool 
occlusion_buffer_dirty; + DisplayServer::WindowID viewport_to_screen; Rect2 viewport_to_screen_rect; bool viewport_render_direct_to_screen; @@ -143,6 +146,8 @@ public: msaa = RS::VIEWPORT_MSAA_DISABLED; screen_space_aa = RS::VIEWPORT_SCREEN_SPACE_AA_DISABLED; use_debanding = false; + use_occlusion_culling = false; + occlusion_buffer_dirty = true; snap_2d_transforms_to_pixel = false; snap_2d_vertices_to_pixel = false; @@ -185,6 +190,10 @@ private: void _draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye); void _draw_viewport(Viewport *p_viewport, XRInterface::Eyes p_eye = XRInterface::EYE_MONO); + int occlusion_rays_per_thread = 512; + + void _resize_occlusion_culling_buffer(const Size2i &p_size); + public: RID viewport_allocate(); void viewport_initialize(RID p_rid); @@ -204,6 +213,7 @@ public: void viewport_set_clear_mode(RID p_viewport, RS::ViewportClearMode p_clear_mode); RID viewport_get_texture(RID p_viewport) const; + RID viewport_get_occluder_debug_texture(RID p_viewport) const; void viewport_set_hide_scenario(RID p_viewport, bool p_hide); void viewport_set_hide_canvas(RID p_viewport, bool p_hide); @@ -225,7 +235,9 @@ public: void viewport_set_msaa(RID p_viewport, RS::ViewportMSAA p_msaa); void viewport_set_screen_space_aa(RID p_viewport, RS::ViewportScreenSpaceAA p_mode); void viewport_set_use_debanding(RID p_viewport, bool p_use_debanding); - + void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling); + void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread); + void viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality); void viewport_set_lod_threshold(RID p_viewport, float p_pixels); virtual int viewport_get_render_info(RID p_viewport, RS::ViewportRenderInfo p_info); diff --git a/servers/rendering/rendering_server_default.h b/servers/rendering/rendering_server_default.h index 683a22fd9ae..324c002d6fe 100644 --- a/servers/rendering/rendering_server_default.h +++ 
b/servers/rendering/rendering_server_default.h @@ -540,6 +540,10 @@ public: FUNC2(camera_set_camera_effects, RID, RID) FUNC2(camera_set_use_vertical_aspect, RID, bool) + /* OCCLUDER */ + FUNCRIDSPLIT(occluder) + FUNC3(occluder_set_mesh, RID, const PackedVector3Array &, const PackedInt32Array &); + #undef server_name #undef ServerName //from now on, calls forwarded to this singleton @@ -590,6 +594,9 @@ public: FUNC2(viewport_set_msaa, RID, ViewportMSAA) FUNC2(viewport_set_screen_space_aa, RID, ViewportScreenSpaceAA) FUNC2(viewport_set_use_debanding, RID, bool) + FUNC2(viewport_set_use_occlusion_culling, RID, bool) + FUNC1(viewport_set_occlusion_rays_per_thread, int) + FUNC1(viewport_set_occlusion_culling_build_quality, ViewportOcclusionCullingBuildQuality) FUNC2(viewport_set_lod_threshold, RID, float) FUNC2R(int, viewport_get_render_info, RID, ViewportRenderInfo) diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp index f8644b5ecb6..3b6838c9bae 100644 --- a/servers/rendering_server.cpp +++ b/servers/rendering_server.cpp @@ -1594,7 +1594,7 @@ void RenderingServer::_bind_methods() { ClassDB::bind_method(D_METHOD("gi_probe_set_compress", "probe", "enable"), &RenderingServer::gi_probe_set_compress); ClassDB::bind_method(D_METHOD("gi_probe_is_compressed", "probe"), &RenderingServer::gi_probe_is_compressed); #endif -/* + /* ClassDB::bind_method(D_METHOD("lightmap_create()"), &RenderingServer::lightmap_capture_create); ClassDB::bind_method(D_METHOD("lightmap_capture_set_bounds", "capture", "bounds"), &RenderingServer::lightmap_capture_set_bounds); ClassDB::bind_method(D_METHOD("lightmap_capture_get_bounds", "capture"), &RenderingServer::lightmap_capture_get_bounds); @@ -1607,6 +1607,10 @@ void RenderingServer::_bind_methods() { ClassDB::bind_method(D_METHOD("lightmap_capture_set_energy", "capture", "energy"), &RenderingServer::lightmap_capture_set_energy); ClassDB::bind_method(D_METHOD("lightmap_capture_get_energy", "capture"), 
&RenderingServer::lightmap_capture_get_energy); */ + + ClassDB::bind_method(D_METHOD("occluder_create"), &RenderingServer::occluder_create); + ClassDB::bind_method(D_METHOD("occluder_set_mesh"), &RenderingServer::occluder_set_mesh); + #endif ClassDB::bind_method(D_METHOD("particles_create"), &RenderingServer::particles_create); ClassDB::bind_method(D_METHOD("particles_set_emitting", "particles", "emitting"), &RenderingServer::particles_set_emitting); @@ -1667,6 +1671,9 @@ void RenderingServer::_bind_methods() { ClassDB::bind_method(D_METHOD("viewport_set_shadow_atlas_quadrant_subdivision", "viewport", "quadrant", "subdivision"), &RenderingServer::viewport_set_shadow_atlas_quadrant_subdivision); ClassDB::bind_method(D_METHOD("viewport_set_msaa", "viewport", "msaa"), &RenderingServer::viewport_set_msaa); ClassDB::bind_method(D_METHOD("viewport_set_use_debanding", "viewport", "enable"), &RenderingServer::viewport_set_use_debanding); + ClassDB::bind_method(D_METHOD("viewport_set_use_occlusion_culling", "viewport", "enable"), &RenderingServer::viewport_set_use_occlusion_culling); + ClassDB::bind_method(D_METHOD("viewport_set_occlusion_rays_per_thread", "rays_per_thread"), &RenderingServer::viewport_set_occlusion_rays_per_thread); + ClassDB::bind_method(D_METHOD("viewport_set_occlusion_culling_build_quality", "quality"), &RenderingServer::viewport_set_occlusion_culling_build_quality); ClassDB::bind_method(D_METHOD("viewport_get_render_info", "viewport", "info"), &RenderingServer::viewport_get_render_info); ClassDB::bind_method(D_METHOD("viewport_set_debug_draw", "viewport", "draw"), &RenderingServer::viewport_set_debug_draw); @@ -1694,6 +1701,7 @@ void RenderingServer::_bind_methods() { ClassDB::bind_method(D_METHOD("scenario_create"), &RenderingServer::scenario_create); ClassDB::bind_method(D_METHOD("scenario_set_debug", "scenario", "debug_mode"), &RenderingServer::scenario_set_debug); ClassDB::bind_method(D_METHOD("scenario_set_environment", "scenario", "environment"), 
&RenderingServer::scenario_set_environment); + ClassDB::bind_method(D_METHOD("scenario_set_camera_effects", "scenario", "effects"), &RenderingServer::scenario_set_camera_effects); ClassDB::bind_method(D_METHOD("scenario_set_fallback_environment", "scenario", "environment"), &RenderingServer::scenario_set_fallback_environment); #ifndef _3D_DISABLED @@ -2024,6 +2032,7 @@ void RenderingServer::_bind_methods() { BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI); BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI_PROBES); BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_GI_BUFFER); + BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_OCCLUDERS); BIND_ENUM_CONSTANT(SKY_MODE_QUALITY); BIND_ENUM_CONSTANT(SKY_MODE_REALTIME); @@ -2093,6 +2102,10 @@ void RenderingServer::_bind_methods() { BIND_ENUM_CONSTANT(SCENARIO_DEBUG_OVERDRAW); BIND_ENUM_CONSTANT(SCENARIO_DEBUG_SHADELESS); + BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW); + BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM); + BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH); + BIND_ENUM_CONSTANT(INSTANCE_NONE); BIND_ENUM_CONSTANT(INSTANCE_MESH); BIND_ENUM_CONSTANT(INSTANCE_MULTIMESH); @@ -2104,12 +2117,14 @@ void RenderingServer::_bind_methods() { BIND_ENUM_CONSTANT(INSTANCE_DECAL); BIND_ENUM_CONSTANT(INSTANCE_GI_PROBE); BIND_ENUM_CONSTANT(INSTANCE_LIGHTMAP); + BIND_ENUM_CONSTANT(INSTANCE_OCCLUDER); BIND_ENUM_CONSTANT(INSTANCE_MAX); BIND_ENUM_CONSTANT(INSTANCE_GEOMETRY_MASK); BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_BAKED_LIGHT); BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_DYNAMIC_GI); BIND_ENUM_CONSTANT(INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE); + BIND_ENUM_CONSTANT(INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING); BIND_ENUM_CONSTANT(INSTANCE_FLAG_MAX); BIND_ENUM_CONSTANT(SHADOW_CASTING_SETTING_OFF); @@ -2340,6 +2355,10 @@ RenderingServer::RenderingServer() { ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/amount", PropertyInfo(Variant::FLOAT, 
"rendering/anti_aliasing/screen_space_roughness_limiter/amount", PROPERTY_HINT_RANGE, "0.01,4.0,0.01")); ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/limit", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/limit", PROPERTY_HINT_RANGE, "0.01,1.0,0.01")); + GLOBAL_DEF_RST("rendering/occlusion_culling/occlusion_rays_per_thread", 512); + GLOBAL_DEF_RST("rendering/occlusion_culling/bvh_build_quality", 2); + ProjectSettings::get_singleton()->set_custom_property_info("rendering/occlusion_culling/bvh_build_quality", PropertyInfo(Variant::INT, "rendering/occlusion_culling/bvh_build_quality", PROPERTY_HINT_ENUM, "Low,Medium,High")); + GLOBAL_DEF("rendering/environment/glow/upscale_mode", 1); ProjectSettings::get_singleton()->set_custom_property_info("rendering/environment/glow/upscale_mode", PropertyInfo(Variant::INT, "rendering/environment/glow/upscale_mode", PROPERTY_HINT_ENUM, "Linear (Fast),Bicubic (Slow)")); GLOBAL_DEF("rendering/environment/glow/upscale_mode.mobile", 0); diff --git a/servers/rendering_server.h b/servers/rendering_server.h index 694fae7fde7..c74bb22aadf 100644 --- a/servers/rendering_server.h +++ b/servers/rendering_server.h @@ -713,6 +713,11 @@ public: virtual void camera_set_camera_effects(RID p_camera, RID p_camera_effects) = 0; virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0; + /* OCCLUDER API */ + + virtual RID occluder_create() = 0; + virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0; + /* VIEWPORT TARGET API */ enum CanvasItemTextureFilter { @@ -826,6 +831,17 @@ public: virtual void viewport_set_lod_threshold(RID p_viewport, float p_pixels) = 0; + virtual void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_debanding) = 0; + virtual void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) = 0; + + enum 
ViewportOcclusionCullingBuildQuality { + VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW = 0, + VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM = 1, + VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH = 2, + }; + + virtual void viewport_set_occlusion_culling_build_quality(ViewportOcclusionCullingBuildQuality p_quality) = 0; + enum ViewportRenderInfo { VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME, VIEWPORT_RENDER_INFO_VERTICES_IN_FRAME, @@ -862,6 +878,7 @@ public: VIEWPORT_DEBUG_DRAW_CLUSTER_SPOT_LIGHTS, VIEWPORT_DEBUG_DRAW_CLUSTER_DECALS, VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, + VIEWPORT_DEBUG_DRAW_OCCLUDERS, }; virtual void viewport_set_debug_draw(RID p_viewport, ViewportDebugDraw p_draw) = 0; @@ -1109,6 +1126,7 @@ public: INSTANCE_DECAL, INSTANCE_GI_PROBE, INSTANCE_LIGHTMAP, + INSTANCE_OCCLUDER, INSTANCE_MAX, INSTANCE_GEOMETRY_MASK = (1 << INSTANCE_MESH) | (1 << INSTANCE_MULTIMESH) | (1 << INSTANCE_IMMEDIATE) | (1 << INSTANCE_PARTICLES) @@ -1147,6 +1165,7 @@ public: INSTANCE_FLAG_USE_BAKED_LIGHT, INSTANCE_FLAG_USE_DYNAMIC_GI, INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE, + INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, INSTANCE_FLAG_MAX }; @@ -1505,6 +1524,7 @@ VARIANT_ENUM_CAST(RenderingServer::ViewportMSAA); VARIANT_ENUM_CAST(RenderingServer::ViewportScreenSpaceAA); VARIANT_ENUM_CAST(RenderingServer::ViewportRenderInfo); VARIANT_ENUM_CAST(RenderingServer::ViewportDebugDraw); +VARIANT_ENUM_CAST(RenderingServer::ViewportOcclusionCullingBuildQuality); VARIANT_ENUM_CAST(RenderingServer::SkyMode); VARIANT_ENUM_CAST(RenderingServer::EnvironmentBG); VARIANT_ENUM_CAST(RenderingServer::EnvironmentAmbientSource); diff --git a/thirdparty/README.md b/thirdparty/README.md index 97f21f85393..605b298ac17 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -62,6 +62,26 @@ Files extracted from upstream source: Extracted from .zip provided. Extracted license and header only. 
+## embree-aarch64 + +- Upstream: https://github.com/lighttransport/embree-aarch64 +- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020) +- License: Apache 2.0 + +Files extracted from upstream: + +- All cpp files listed in `modules/raycast/godot_update_embree.py` +- All header files in the directories listed in `modules/raycast/godot_update_embree.py` + +The `modules/raycast/godot_update_embree.py`script can be used to pull the +relevant files from the latest Embree-aarch64 release and apply some automatic changes. + +Some changes have been made in order to remove exceptions and fix minor build errors. +They are marked with `// -- GODOT start --` and `// -- GODOT end --` +comments. Apply the patches in the `patches/` folder when syncing on newer upstream +commits. + + ## enet - Upstream: http://enet.bespin.org diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h new file mode 100644 index 00000000000..01f1f80f6c5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h @@ -0,0 +1,55 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "parallel_reduce.h" + +namespace embree +{ + + template + __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred) + { + bool ret = false; + +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred,&context](const tbb::blocked_range& r) { + if (context.is_group_execution_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { + if (pred(i)) { + ret = true; + context.cancel_group_execution(); + } + } + }); +#else + tbb::parallel_for(tbb::blocked_range{first, last}, [&ret,pred](const tbb::blocked_range& r) { + if (tbb::task::self().is_cancelled()) return; + for (size_t i = r.begin(); i != r.end(); ++i) { 
+ if (pred(i)) { + ret = true; + tbb::task::self().cancel_group_execution(); + } + } + }); +#endif +#else + ret = parallel_reduce (first, last, false, [pred](const range& r)->bool { + bool localret = false; + for (auto i=r.begin(); i() + ); +#endif + + return ret; + } + +} // end namespace diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp new file mode 100644 index 00000000000..acddc0ff816 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp @@ -0,0 +1,56 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_filter.h" +#include "../sys/regression.h" +#include + +namespace embree +{ + struct parallel_filter_regression_test : public RegressionTest + { + parallel_filter_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; }; + + for (size_t N=10; N<1000000; N=size_t(2.1*N)) + { + size_t N0 = rand() % N; + + /* initialize array with random numbers */ + std::vector src(N); + std::map m; + for (size_t i=0; i + inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate) + { + Index j = first; + for (Index i=first; i + inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate) + { + /* sequential fallback */ + if (end-begin <= minStepSize) + return sequential_filter(data,begin,end,predicate); + + /* calculate number of tasks to use */ + enum { MAX_TASKS = 64 }; + const Index numThreads = TaskScheduler::threadCount(); + const Index numBlocks = (end-begin+minStepSize-1)/minStepSize; + const Index taskCount = min(numThreads,numBlocks,(Index)MAX_TASKS); + + /* filter blocks */ + Index nused[MAX_TASKS]; + Index nfree[MAX_TASKS]; + 
parallel_for(taskCount, [&](const Index taskIndex) + { + const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount; + const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount; + const Index i2 = sequential_filter(data,i0,i1,predicate); + nused[taskIndex] = i2-i0; + nfree[taskIndex] = i1-i2; + }); + + /* calculate offsets */ + Index sused=0; + Index sfree=0; + Index pfree[MAX_TASKS]; + for (Index i=0; i0; i--) + { + if (k0 > r1) break; + Index k1 = k0+nused[i]; + Index src = begin+(i+0)*(end-begin)/taskCount+nused[i]; + for (Index i=max(r0,k0); i= begin && dst < end); + assert(isrc >= begin && isrc < end); + data[dst++] = data[isrc]; + } + k0 = k1; + } + }); + + return begin+sused; + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp new file mode 100644 index 00000000000..ef070ebc4d0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp @@ -0,0 +1,48 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_for_regression_test : public RegressionTest + { + parallel_for_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + const size_t M = 10; + for (size_t N=10; N<10000000; N=size_t(2.1*N)) + { + /* sequentially calculate sum of squares */ + size_t sum0 = 0; + for (size_t i=0; i sum1(0); + parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range& r) + { + size_t s = 0; + for (size_t i=r.begin(); i +#include +#include +#endif + +namespace embree +{ + /* parallel_for without range */ + template + __forceinline void parallel_for( const Index N, const Func& func) + { +#if defined(TASKING_INTERNAL) + if (N) { + TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range& r) { + assert(r.size() == 1); + func(r.begin()); + }); + if 
(!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + } +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1; + const size_t length = N; + const size_t blockSize = (length + baselineNumBlocks-1) / baselineNumBlocks; + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = (currentBlock * blockSize); + const size_t blockLength = std::min(length - start, blockSize); + const size_t end = start + blockLength; + + for(size_t i=start; i < end; i++) + { + func(i); + } + }); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range and granulatity */ + template + __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func) + { + assert(first <= last); +#if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + +#elif defined(TASKING_GCD) && defined(BUILD_IOS) + + const size_t baselineNumBlocks 
= (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1; + const size_t length = last - first; + const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks; + size_t blockSize = std::max(minStepSize,blockSizeByThreads); + blockSize += blockSize % 4; + + const size_t numBlocks = (length + blockSize-1) / blockSize; + + dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) { + + const size_t start = first + (currentBlock * blockSize); + const size_t end = std::min(last, start + blockSize); + + func( embree::range(start,end) ); + }); + + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + +#elif defined(TASKING_PPL) + concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { + func(range(i,i+1)); + }); + +#else +# error "no tasking system enabled" +#endif + } + + /* parallel for with range */ + template + __forceinline void parallel_for( const Index first, const Index last, const Func& func) + { + assert(first <= last); + parallel_for(first,last,(Index)1,func); + } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001) + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + 
},tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + + typedef tbb::affinity_partitioner affinity_partitioner; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap) + { + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) + // -- GODOT start -- + // throw std::runtime_error("task cancelled"); + abort(); + // -- GODOT end -- + #endif + } + +#else + + template + __forceinline void parallel_for_static( const Index N, const Func& func) + { + parallel_for(N,func); + } + + struct affinity_partitioner { + }; + + template + __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) + { + parallel_for(N,func); + } + +#endif +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp new file mode 100644 index 00000000000..0337611b350 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp @@ -0,0 +1,63 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for_for.h" +#include "../sys/regression.h" + +namespace embree +{ + struct 
parallel_for_for_regression_test : public RegressionTest + { + parallel_for_for_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + size_t sum0 = 0; + size_t K = 0; + const size_t M = 1000; + std::vector* > array2(M); + for (size_t i=0; i(N); + for (size_t j=0; j> verify_k(K); + for (size_t i=0; i sum1(0); + parallel_for_for( array2, size_t(1), [&](std::vector* v, const range& r, size_t k) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i + __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + size_t k=0; + for (size_t i=0; i!=array2.size(); ++i) { + const size_t N = array2[i]->size(); + if (N) func(array2[i],range(0,N),k); + k+=N; + } + } + + class ParallelForForState + { + public: + + enum { MAX_TASKS = 64 }; + + __forceinline ParallelForForState () + : taskCount(0) {} + + template + __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { + init(array2,minStepSize); + } + + template + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + /* first calculate total number of elements */ + size_t N = 0; + for (size_t i=0; isize() : 0; + } + this->N = N; + + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (N+minStepSize-1)/minStepSize; + taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); + + /* calculate start (i,j) for each task */ + size_t taskIndex = 0; + i0[taskIndex] = 0; + j0[taskIndex] = 0; + size_t k0 = (++taskIndex)*N/taskCount; + for (size_t i=0, k=0; taskIndex < taskCount; i++) + { + assert(isize() : 0; + while (j= k0 && taskIndex < taskCount) { + assert(taskIndex + __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) + { + ParallelForForState 
state(array2,minStepSize); + + parallel_for(state.taskCount, [&](const size_t taskIndex) + { + /* calculate range */ + const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; + const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) func(array2[i],range(r0,r1),k); + k+=r1-r0; j0 = 0; + } + }); + } + + template + __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) + { + parallel_for_for(array2,1,func); + } + + template + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + ParallelForForState state(array2,minStepSize); + Value temp[ParallelForForState::MAX_TASKS]; + + for (size_t i=0; isize() : 0; + const size_t r0 = j0, r1 = min(N,r0+k1-k); + if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range(r0,r1),k)); + k+=r1-r0; j0 = 0; + } + }); + + Value ret = identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_reduce(array2,1,identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp new file mode 100644 index 00000000000..0169d8e4818 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp @@ -0,0 +1,85 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_for_for_prefix_sum.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_for_for_prefix_sum_regression_test : public RegressionTest + { + 
parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + const size_t M = 10; + std::vector> flattened; + typedef std::vector* > ArrayArray; + ArrayArray array2(M); + size_t K = 0; + for (size_t i=0; i(N); + for (size_t j=0; j> verify_k(K); + for (size_t i=0; i state(array2,size_t(1)); + + /* dry run only counts */ + size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector* v, const range& r, size_t k, size_t i) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i* v, const range& r, size_t k, size_t i, const size_t base) -> size_t + { + size_t s = 0; + for (size_t i=r.begin(); i + struct ParallelForForPrefixSumState : public ParallelForForState + { + __forceinline ParallelForForPrefixSumState () {} + + template + __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) + : ParallelForForState(array2,minStepSize) {} + + ParallelPrefixSumState prefix_state; + }; + + template + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i)); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* 
calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t k0 = (taskIndex+0)*state.size()/taskCount; + const size_t k1 = (taskIndex+1)*state.size()/taskCount; + size_t i0 = state.i0[taskIndex]; + size_t j0 = state.j0[taskIndex]; + + /* iterate over arrays */ + size_t k=k0; + Value N=identity; + for (size_t i=i0; ksize() : 0; + const size_t r0 = j0, r1 = min(size,r0+k1-k); + if (r1 > r0) N = reduction(N, func(array2[i],range((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + k+=r1-r0; j0 = 0; + } + state.prefix_state.counts[taskIndex] = N; + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); + } + + template + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState& state, ArrayArray& array2, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp new file mode 100644 index 00000000000..09dc303f814 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp @@ -0,0 +1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 + +#include "parallel_map.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_map_regression_test : public RegressionTest + { + parallel_map_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create key/value vectors with random numbers */ + const size_t N = 10000; + std::vector keys(N); + std::vector vals(N); + for (size_t i=0; i map; + map.init(keys,vals); + + /* check that all keys are properly mapped */ + for (size_t i=0; i + class parallel_map + { + /* key/value pair to build the map */ + struct KeyValue + { + __forceinline KeyValue () {} + + __forceinline KeyValue (const Key key, const Val val) + : key(key), val(val) {} + + __forceinline operator Key() const { + return key; + } + + public: + Key key; + Val val; + }; + + public: + + /*! parallel map constructors */ + parallel_map () {} + + /*! construction from pair of vectors */ + template + parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); } + + /*! initialized the parallel map from a vector with keys and values */ + template + void init(const KeyVector& keys, const ValVector& values) + { + /* reserve sufficient space for all data */ + assert(keys.size() == values.size()); + vec.resize(keys.size()); + + /* generate key/value pairs */ + parallel_for( size_t(0), keys.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(keys.size()); + radix_sort(vec.data(),temp.data(),keys.size()); + } + + /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr of the key is not contained in the map. */ + __forceinline const Val* lookup(const Key& key) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return nullptr; + if (i->key != key) return nullptr; + return &i->val; + } + + /*! 
If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */ + __forceinline Val lookup(const Key& key, const Val& def) const + { + typename std::vector::const_iterator i = std::lower_bound(vec.begin(), vec.end(), key); + if (i == vec.end()) return def; + if (i->key != key) return def; + return i->val; + } + + /*! clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp new file mode 100644 index 00000000000..eb20c4465d2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp @@ -0,0 +1,53 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_partition.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_partition_regression_test : public RegressionTest + { + parallel_partition_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + for (size_t i=0; i<100; i++) + { + /* create random permutation */ + size_t N = std::rand() % 1000000; + std::vector array(N); + for (unsigned i=0; i= split; + } + + return passed; + } + }; + + parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test"); +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h new file mode 100644 index 00000000000..3b3ad7c8541 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h @@ -0,0 +1,283 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "parallel_for.h" +#include "../math/range.h" + +namespace 
embree +{ + /* serial partitioning */ + template + __forceinline size_t serial_partitioning(T* array, + const size_t begin, + const size_t end, + V& leftReduction, + V& rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t) + { + T* l = array + begin; + T* r = array + end - 1; + + while(1) + { + /* *l < pivot */ + while (likely(l <= r && is_left(*l) )) + { + //prefetchw(l+4); // FIXME: enable? + reduction_t(leftReduction,*l); + ++l; + } + /* *r >= pivot) */ + while (likely(l <= r && !is_left(*r))) + { + //prefetchw(r-4); FIXME: enable? + reduction_t(rightReduction,*r); + --r; + } + if (r + class __aligned(64) parallel_partition_task + { + ALIGNED_CLASS_(64); + private: + + static const size_t MAX_TASKS = 64; + + T* array; + size_t N; + const IsLeft& is_left; + const Reduction_T& reduction_t; + const Reduction_V& reduction_v; + const Vi& identity; + + size_t numTasks; + __aligned(64) size_t counter_start[MAX_TASKS+1]; + __aligned(64) size_t counter_left[MAX_TASKS+1]; + __aligned(64) range leftMisplacedRanges[MAX_TASKS]; + __aligned(64) range rightMisplacedRanges[MAX_TASKS]; + __aligned(64) V leftReductions[MAX_TASKS]; + __aligned(64) V rightReductions[MAX_TASKS]; + + public: + + __forceinline parallel_partition_task(T* array, + const size_t N, + const Vi& identity, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + const size_t BLOCK_SIZE) + + : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), + numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {} + + __forceinline const range* findStartRange(size_t& index, const range* const r, const size_t numRanges) + { + size_t i = 0; + while(index >= (size_t)r[i].size()) + { + assert(i < numRanges); + index -= (size_t)r[i].size(); + i++; + } + return &r[i]; + } + + __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, + const size_t 
numRightMisplacedRanges, + const size_t startID, + const size_t endID) + { + size_t leftLocalIndex = startID; + size_t rightLocalIndex = startID; + const range* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges); + const range* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges); + + size_t l_left = l_range->size() - leftLocalIndex; + size_t r_left = r_range->size() - rightLocalIndex; + T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; + T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; + size_t size = endID - startID; + size_t items = min(size,min(l_left,r_left)); + + while (size) + { + if (unlikely(l_left == 0)) + { + l_range++; + l_left = l_range->size(); + l = &array[l_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + if (unlikely(r_left == 0)) + { + r_range++; + r_left = r_range->size(); + r = &array[r_range->begin()]; + items = min(size,min(l_left,r_left)); + } + + size -= items; + l_left -= items; + r_left -= items; + + while(items) { + items--; + xchg(*l++,*r++); + } + } + } + + __forceinline size_t partition(V& leftReduction, V& rightReduction) + { + /* partition the individual ranges for each task */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*N/numTasks; + const size_t endID = (taskID+1)*N/numTasks; + V local_left(identity); + V local_right(identity); + const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); + counter_start[taskID] = startID; + counter_left [taskID] = mid-startID; + leftReductions[taskID] = local_left; + rightReductions[taskID] = local_right; + }); + counter_start[numTasks] = N; + counter_left[numTasks] = 0; + + /* finalize the reductions */ + for (size_t i=0; i globalLeft (0,mid); + const range globalRight(mid,N); + + /* calculate all left and right ranges that are on the wrong global side */ + size_t numMisplacedRangesLeft = 0; + size_t 
numMisplacedRangesRight = 0; + size_t numMisplacedItemsLeft = 0; + size_t numMisplacedItemsRight = 0; + + for (size_t i=0; i left_range (counter_start[i], counter_start[i] + counter_left[i]); + const range right_range(counter_start[i] + counter_left[i], counter_start[i+1]); + const range left_misplaced = globalLeft. intersect(right_range); + const range right_misplaced = globalRight.intersect(left_range); + + if (!left_misplaced.empty()) + { + numMisplacedItemsLeft += left_misplaced.size(); + leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; + } + + if (!right_misplaced.empty()) + { + numMisplacedItemsRight += right_misplaced.size(); + rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; + } + } + assert( numMisplacedItemsLeft == numMisplacedItemsRight ); + + /* if no items are misplaced we are done */ + if (numMisplacedItemsLeft == 0) + return mid; + + /* otherwise we copy the items to the right place in parallel */ + parallel_for(numTasks,[&] (const size_t taskID) { + const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; + const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; + swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID); + }); + + return mid; + } + }; + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE = 128) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < BLOCK_SIZE)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return 
begin+p->partition(leftReduction,rightReduction); + } + } + + template + __noinline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const Vi &identity, + V &leftReduction, + V &rightReduction, + const IsLeft& is_left, + const Reduction_T& reduction_t, + const Reduction_V& reduction_v, + size_t BLOCK_SIZE, + size_t PARALLEL_THRESHOLD) + { + /* fall back to single threaded partitioning for small N */ + if (unlikely(end-begin < PARALLEL_THRESHOLD)) + return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); + + /* otherwise use parallel code */ + else { + typedef parallel_partition_task partition_task; + std::unique_ptr p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); + return begin+p->partition(leftReduction,rightReduction); + } + } + + + template + inline size_t parallel_partitioning(T* array, + const size_t begin, + const size_t end, + const IsLeft& is_left, + size_t BLOCK_SIZE = 128) + { + size_t leftReduction = 0; + size_t rightReduction = 0; + return parallel_partitioning( + array,begin,end,0,leftReduction,rightReduction,is_left, + [] (size_t& t,const T& ref) { }, + [] (size_t& t0,size_t& t1) { }, + BLOCK_SIZE); + } + +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp new file mode 100644 index 00000000000..685952c3dce --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp @@ -0,0 +1,48 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_prefix_sum.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_prefix_sum_regression_test : public RegressionTest + { + parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + const size_t M = 10; 
+ + for (size_t N=10; N<10000000; N=size_t(2.1*N)) + { + /* initialize array with random numbers */ + uint32_t sum0 = 0; + std::vector src(N); + for (size_t i=0; i dst(N); + for (auto& v : dst) v = 0; + + for (size_t i=0; i()); + passed &= (sum0 == sum1); + } + + /* check if prefix sum is correct */ + for (size_t i=0, sum=0; i + struct ParallelPrefixSumState + { + enum { MAX_TASKS = 64 }; + Value counts[MAX_TASKS]; + Value sums [MAX_TASKS]; + }; + + template + __forceinline Value parallel_prefix_sum( ParallelPrefixSumState& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) + { + /* calculate number of tasks to use */ + const size_t numThreads = TaskScheduler::threadCount(); + const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; + const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState::MAX_TASKS)); + + /* perform parallel prefix sum */ + parallel_for(taskCount, [&](const size_t taskIndex) + { + const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; + const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; + state.counts[taskIndex] = func(range(i0,i1),state.sums[taskIndex]); + }); + + /* calculate prefix sum */ + Value sum=identity; + for (size_t i=0; i + __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) + { + /* perform single threaded prefix operation for small N */ + if (N < SINGLE_THREAD_THRESHOLD) + { + Value sum=identity; + for (size_t i=0; i state; + + /* initial run just sets up start values for subtasks */ + parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i& r, const Value& sum) -> Value { + + Value s = identity; + for (size_t i=r.begin(); i& r) -> size_t + { + size_t s = 0; + for (size_t 
i=r.begin(); i + __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + return func(range(first,last)); + } + + template + __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { + const Index maxTasks = 512; + const Index threadCount = (Index) TaskScheduler::threadCount(); + taskCount = min(taskCount,threadCount,maxTasks); + + /* parallel invokation of all tasks */ + dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack + parallel_for(taskCount, [&](const Index taskIndex) { + const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; + const Index k1 = first+(taskIndex+1)*(last-first)/taskCount; + values[taskIndex] = func(range(k0,k1)); + }); + + /* perform reduction over all tasks */ + Value v = identity; + for (Index i=0; i + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) + { +#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS)) + + /* fast path for small number of iterations */ + Index taskCount = (last-first+minStepSize-1)/minStepSize; + if (likely(taskCount == 1)) { + return func(range(first,last)); + } + return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction); + +#elif defined(TASKING_TBB) + #if TBB_INTERFACE_VERSION >= 12002 + tbb::task_group_context context; + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const 
tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction,context); + // -- GODOT start -- + // if (context.is_group_execution_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction); + // -- GODOT start -- + // if (tbb::task::self().is_cancelled()) + // throw std::runtime_error("task cancelled"); + // -- GODOT end -- + return v; + #endif +#else // TASKING_PPL + struct AlignedValue + { + char storage[__alignof(Value)+sizeof(Value)]; + static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; + Value* getValuePtr() { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + const Value* getValuePtr() const { return reinterpret_cast(alignUp(uintptr_t(storage), __alignof(Value))); } + AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } + AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); } + AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; + AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; }; + operator Value() const { return *getValuePtr(); } + }; + + struct Iterator_Index + { + Index v; + typedef std::forward_iterator_tag iterator_category; + typedef AlignedValue value_type; + typedef Index difference_type; + typedef Index distance_type; + typedef AlignedValue* pointer; + typedef AlignedValue& reference; + __forceinline Iterator_Index() {} + __forceinline Iterator_Index(Index v) : v(v) {} + __forceinline bool operator== (Iterator_Index other) { return v == other.v; } + 
__forceinline bool operator!= (Iterator_Index other) { return v != other.v; } + __forceinline Iterator_Index operator++() { return Iterator_Index(++v); } + __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); } + }; + + auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) { + assert(begin.v < end.v); + return reduction(start, func(range(begin.v, end.v))); + }; + const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); + return v; +#endif + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + if (likely(last-first < parallel_threshold)) { + return func(range(first,last)); + } else { + return parallel_reduce(first,last,minStepSize,identity,func,reduction); + } + } + + template + __forceinline Value parallel_reduce( const range range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction); + } + + template + __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) + { + auto funcr = [&] ( const range r ) { + Value v = identity; + for (Index i=r.begin(); i + __forceinline Value parallel_reduce( const range range, const Value& identity, const Func& func, const Reduction& reduction ) + { + return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction); + } +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp new file mode 100644 index 00000000000..20b639c1c9b --- /dev/null +++ 
b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp @@ -0,0 +1,43 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_set.h" +#include "../sys/regression.h" + +namespace embree +{ + struct parallel_set_regression_test : public RegressionTest + { + parallel_set_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + + /* create vector with random numbers */ + const size_t N = 10000; + std::vector unsorted(N); + for (size_t i=0; i sorted; + sorted.init(unsorted); + + /* check that all elements are in the set */ + for (size_t i=0; i + class parallel_set + { + public: + + /*! default constructor for the parallel set */ + parallel_set () {} + + /*! construction from vector */ + template + parallel_set (const Vector& in) { init(in); } + + /*! initialized the parallel set from a vector */ + template + void init(const Vector& in) + { + /* copy data to internal vector */ + vec.resize(in.size()); + parallel_for( size_t(0), in.size(), size_t(4*4096), [&](const range& r) { + for (size_t i=r.begin(); i temp(in.size()); + radix_sort(vec.data(),temp.data(),vec.size()); + } + + /*! tests if some element is in the set */ + __forceinline bool lookup(const T& elt) const { + return std::binary_search(vec.begin(), vec.end(), elt); + } + + /*! 
clears all state */ + void clear() { + vec.clear(); + } + + private: + std::vector vec; //!< vector containing sorted elements + }; +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp new file mode 100644 index 00000000000..5e7ec79ac19 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp @@ -0,0 +1,50 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "parallel_sort.h" +#include "../sys/regression.h" + +namespace embree +{ + template + struct RadixSortRegressionTest : public RegressionTest + { + RadixSortRegressionTest(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + const size_t M = 10; + + for (size_t N=10; N<1000000; N=size_t(2.1*N)) + { + std::vector src(N); memset(src.data(),0,N*sizeof(Key)); + std::vector tmp(N); memset(tmp.data(),0,N*sizeof(Key)); + for (size_t i=0; i(src.data(),tmp.data(),N); + } + + /* calculate checksum */ + Key sum1 = 0; for (size_t i=0; i test_u32("RadixSortRegressionTestU32"); + RadixSortRegressionTest test_u64("RadixSortRegressionTestU64"); +} diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h new file mode 100644 index 00000000000..a758227c1b8 --- /dev/null +++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h @@ -0,0 +1,457 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../simd/simd.h" +#include "parallel_for.h" +#if defined(TASKING_GCD) && defined(BUILD_IOS) +#include "../sys/alloc.h" +#endif +#include + +namespace embree +{ + template + __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v < array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + 
template + __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length) + { + for(size_t i = 1;i 0 && v > array[j-1]) + { + array[j] = array[j-1]; + --j; + } + array[j] = v; + } + } + + template + void quicksort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_ascending(t, begin, pivot); + quicksort_ascending(t, pivot + 1, end); + } + } + + template + void quicksort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const int pivot = right; + quicksort_decending(t, begin, pivot); + quicksort_decending(t, pivot + 1, end); + } + } + + + template + void quicksort_insertionsort_ascending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_ascending(&t[begin],size); + } + else + { + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] > pivotvalue); + while (t[++left] < pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_ascending(t, begin, pivot); + quicksort_insertionsort_ascending(t, pivot + 1, end); + } + } + } + + + 
template + void quicksort_insertionsort_decending(T *__restrict__ t, + const ssize_t begin, + const ssize_t end) + { + if (likely(begin < end)) + { + const ssize_t size = end-begin+1; + if (likely(size <= THRESHOLD)) + { + insertionsort_decending(&t[begin],size); + } + else + { + + const T pivotvalue = t[begin]; + ssize_t left = begin - 1; + ssize_t right = end + 1; + + while(1) + { + while (t[--right] < pivotvalue); + while (t[++left] > pivotvalue); + + if (left >= right) break; + + const T temp = t[right]; + t[right] = t[left]; + t[left] = temp; + } + + const ssize_t pivot = right; + quicksort_insertionsort_decending(t, begin, pivot); + quicksort_insertionsort_decending(t, pivot + 1, end); + } + } + } + + template + static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) + { + static const unsigned int BITS = 8; + static const unsigned int BUCKETS = (1 << BITS); + static const unsigned int CMP_SORT_THRESHOLD = 16; + + __aligned(64) unsigned int count[BUCKETS]; + + /* clear buckets */ + for (size_t i=0;i> shift) & (BUCKETS-1)]++; + + /* prefix sums */ + __aligned(64) unsigned int head[BUCKETS]; + __aligned(64) unsigned int tail[BUCKETS]; + + head[0] = 0; + for (size_t i=1; i> shift) & (BUCKETS-1); + if (b == i) break; + std::swap(v,morton[head[b]++]); + } + assert((unsigned(v) >> shift & (BUCKETS-1)) == i); + morton[head[i]++] = v; + } + } + if (shift == 0) return; + + size_t offset = 0; + for (size_t i=0;i> shift) & (BUCKETS-1)) == i); + + if (unlikely(count[i] < CMP_SORT_THRESHOLD)) + insertionsort_ascending(morton + offset, count[i]); + else + radixsort32(morton + offset, count[i], shift-BITS); + + for (size_t j=offset;j + class ParallelRadixSort + { + static const size_t MAX_TASKS = 64; + static const size_t BITS = 8; + static const size_t BUCKETS = (1 << BITS); + typedef unsigned int TyRadixCount[BUCKETS]; + + template + static bool compare(const T& v0, const T& v1) { + return (Key)v0 < (Key)v1; + } + + private: + 
ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement + ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement + + + public: + ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) + : radixCount(nullptr), src(src), tmp(tmp), N(N) {} + + void sort(const size_t blockSize) + { + assert(blockSize > 0); + + /* perform single threaded sort for small N */ + if (N<=blockSize) // handles also special case of 0! + { + /* do inplace sort inside destination array */ + std::sort(src,src+N,compare); + } + + /* perform parallel sort for large N */ + else + { + const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS)); + tbbRadixSort(numThreads); + } + } + + ~ParallelRadixSort() + { + alignedFree(radixCount); + radixCount = nullptr; + } + + private: + + void tbbRadixIteration0(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* count how many items go into the buckets */ + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const Key index = ((Key)src[i] >> shift) & mask; +#endif + count[index]++; + } + } + + void tbbRadixIteration1(const Key shift, + const Ty* __restrict const src, + Ty* __restrict const dst, + const size_t threadIndex, const size_t threadCount) + { + const size_t startID = (threadIndex+0)*N/threadCount; + const size_t endID = (threadIndex+1)*N/threadCount; + + /* mask to extract some number of bits */ + const Key mask = BUCKETS-1; + + /* calculate total number of items for each bucket */ + __aligned(64) unsigned int total[BUCKETS]; + /* + for (size_t i=0; i> (size_t)shift) & (size_t)mask; +#else + const size_t index = ((Key)src[i] >> shift) & mask; +#endif + 
dst[offset[index]++] = elt; + } + } + + void tbbRadixIteration(const Key shift, const bool last, + const Ty* __restrict src, Ty* __restrict dst, + const size_t numTasks) + { + affinity_partitioner ap; + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap); + parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap); + } + + void tbbRadixSort(const size_t numTasks) + { + radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64); + + if (sizeof(Key) == sizeof(uint32_t)) { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,1,tmp,src,numTasks); + } + else if (sizeof(Key) == sizeof(uint64_t)) + { + tbbRadixIteration(0*BITS,0,src,tmp,numTasks); + tbbRadixIteration(1*BITS,0,tmp,src,numTasks); + tbbRadixIteration(2*BITS,0,src,tmp,numTasks); + tbbRadixIteration(3*BITS,0,tmp,src,numTasks); + tbbRadixIteration(4*BITS,0,src,tmp,numTasks); + tbbRadixIteration(5*BITS,0,tmp,src,numTasks); + tbbRadixIteration(6*BITS,0,src,tmp,numTasks); + tbbRadixIteration(7*BITS,1,tmp,src,numTasks); + } + } + + private: + TyRadixCount* radixCount; + Ty* const src; + Ty* const tmp; + const size_t N; + }; + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) + { + ParallelRadixSort(src,tmp,N).sort(blockSize); + } + + template + void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } + + template + void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { + radix_sort(src,tmp,N,blockSize); + } +} diff --git 
a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h new file mode 100644 index 00000000000..db46dc114fb --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/parsestream.h @@ -0,0 +1,101 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stringstream.h" +#include "../sys/filename.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/col3.h" +#include "../math/color.h" + +namespace embree +{ + /*! helper class for simple command line parsing */ + class ParseStream : public Stream + { + public: + ParseStream (const Ref >& cin) : cin(cin) {} + + ParseStream (const Ref >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false) + : cin(new StringStream(cin,seps,endl,multiLine)) {} + + public: + ParseLocation location() { return cin->loc(); } + std::string next() { return cin->get(); } + + void force(const std::string& next) { + std::string token = getString(); + if (token != next) + THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found"); + } + + std::string getString() { + return get(); + } + + FileName getFileName() { + return FileName(get()); + } + + int getInt () { + return atoi(get().c_str()); + } + + Vec2i getVec2i() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + return Vec2i(x,y); + } + + Vec3ia getVec3ia() { + int x = atoi(get().c_str()); + int y = atoi(get().c_str()); + int z = atoi(get().c_str()); + return Vec3ia(x,y,z); + } + + float getFloat() { + return (float)atof(get().c_str()); + } + + Vec2f getVec2f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + return Vec2f(x,y); + } + + Vec3f getVec3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3f(x,y,z); + } + + Vec3fa getVec3fa() { + float x = 
(float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Vec3fa(x,y,z); + } + + Col3f getCol3f() { + float x = (float)atof(get().c_str()); + float y = (float)atof(get().c_str()); + float z = (float)atof(get().c_str()); + return Col3f(x,y,z); + } + + Color getColor() { + float r = (float)atof(get().c_str()); + float g = (float)atof(get().c_str()); + float b = (float)atof(get().c_str()); + return Color(r,g,b); + } + + private: + Ref > cin; + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h new file mode 100644 index 00000000000..3f75677e687 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/ref.h" +#include "../sys/filename.h" +#include "../sys/string.h" + +#include +#include +#include +#include + +namespace embree +{ + /*! stores the location of a stream element in the source */ + class ParseLocation + { + public: + ParseLocation () : lineNumber(-1), colNumber(-1) {} + ParseLocation (std::shared_ptr fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/) + : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {} + + std::string str() const + { + std::string str = "unknown"; + if (fileName) str = *fileName; + if (lineNumber >= 0) str += " line " + toString(lineNumber); + if (lineNumber >= 0 && colNumber >= 0) str += " character " + toString(colNumber); + return str; + } + + private: + std::shared_ptr fileName; /// name of the file (or stream) the token is from + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + }; + + /*! 
a stream class templated over the stream elements */ + template class Stream : public RefCount + { + enum { BUF_SIZE = 1024 }; + + private: + virtual T next() = 0; + virtual ParseLocation location() = 0; + __forceinline std::pair nextHelper() { + ParseLocation l = location(); + T v = next(); + return std::pair(v,l); + } + __forceinline void push_back(const std::pair& v) { + if (past+future == BUF_SIZE) pop_front(); + size_t end = (start+past+future++)%BUF_SIZE; + buffer[end] = v; + } + __forceinline void pop_front() { + if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); + start = (start+1)%BUF_SIZE; past--; + } + public: + Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {} + virtual ~Stream() {} + + public: + + const ParseLocation& loc() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].second; + } + T get() { + if (future == 0) push_back(nextHelper()); + T t = buffer[(start+past)%BUF_SIZE].first; + past++; future--; + return t; + } + const T& peek() { + if (future == 0) push_back(nextHelper()); + return buffer[(start+past)%BUF_SIZE].first; + } + const T& unget(size_t n = 1) { + if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items"); + past -= n; future += n; + return peek(); + } + void drop() { + if (future == 0) push_back(nextHelper()); + past++; future--; + } + private: + size_t start,past,future; + std::vector > buffer; + }; + + /*! 
warps an iostream stream */ + class StdStream : public Stream + { + public: + StdStream (std::istream& cin, const std::string& name = "std::stream") + : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} + ~StdStream() {} + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + int next() { + int c = cin.get(); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + private: + std::istream& cin; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; + + /*! creates a stream from a file */ + class FileStream : public Stream + { + public: + + FileStream (FILE* file, const std::string& name = "file") + : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(name))) {} + + FileStream (const FileName& fileName) + : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr(new std::string(fileName.str()))) + { + file = fopen(fileName.c_str(),"r"); + if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str()); + } + ~FileStream() { if (file) fclose(file); } + + public: + ParseLocation location() { + return ParseLocation(name,lineNumber,colNumber,charNumber); + } + + int next() { + int c = fgetc(file); + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + FILE* file; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; + + /*! 
creates a stream from a string */ + class StrStream : public Stream + { + public: + + StrStream (const char* str) + : str(str), lineNumber(1), colNumber(0), charNumber(0) {} + + public: + ParseLocation location() { + return ParseLocation(std::shared_ptr(),lineNumber,colNumber,charNumber); + } + + int next() { + int c = str[charNumber]; + if (c == 0) return EOF; + if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; + charNumber++; + return c; + } + + private: + const char* str; + ssize_t lineNumber; /// the line number the token is from + ssize_t colNumber; /// the character number in the current line + ssize_t charNumber; /// the character in the file + }; + + /*! creates a character stream from a command line */ + class CommandLineStream : public Stream + { + public: + CommandLineStream (int argc, char** argv, const std::string& name = "command line") + : i(0), j(0), charNumber(0), name(std::shared_ptr(new std::string(name))) + { + if (argc > 0) { + for (size_t i=0; argv[0][i] && i<1024; i++) charNumber++; + charNumber++; + } + for (ssize_t k=1; k args; + ssize_t charNumber; /// the character in the file + std::shared_ptr name; /// name of buffer + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h new file mode 100644 index 00000000000..25580a77b8e --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/streamfilters.h @@ -0,0 +1,39 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /* removes all line comments from a stream */ + class LineCommentFilter : public Stream + { + public: + LineCommentFilter (const FileName& fileName, const std::string& lineComment) + : cin(new FileStream(fileName)), lineComment(lineComment) {} + LineCommentFilter (Ref > cin, const std::string& lineComment) + : cin(cin), lineComment(lineComment) {} + + ParseLocation location() 
{ return cin->loc(); } + + int next() + { + /* look if the line comment starts here */ + for (size_t j=0; jpeek() != lineComment[j]) { cin->unget(j); goto not_found; } + cin->get(); + } + /* eat all characters until the end of the line (or file) */ + while (cin->peek() != '\n' && cin->peek() != EOF) cin->get(); + + not_found: + return cin->get(); + } + + private: + Ref > cin; + std::string lineComment; + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp new file mode 100644 index 00000000000..98dc80ad59a --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp @@ -0,0 +1,51 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stringstream.h" + +namespace embree +{ + static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i >& cin, const std::string& seps, const std::string& endl, bool multiLine) + : cin(cin), endl(endl), multiLine(multiLine) + { + createCharMap(isSepMap,seps); + createCharMap(isValidCharMap,stringChars); + } + + std::string StringStream::next() + { + /* skip separators */ + while (cin->peek() != EOF) { + if (endl != "" && cin->peek() == '\n') { cin->drop(); return endl; } + if (multiLine && cin->peek() == '\\') { + cin->drop(); + if (cin->peek() == '\n') { cin->drop(); continue; } + cin->unget(); + } + if (!isSeparator(cin->peek())) break; + cin->drop(); + } + + /* parse everything until the next separator */ + std::vector str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); + // -- GODOT start -- + // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); 
+ if (!isValidChar(c)) abort(); + // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); + return std::string(str.data()); + } +} diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h new file mode 100644 index 00000000000..e6dbd4aecc2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/stringstream.h @@ -0,0 +1,29 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" + +namespace embree +{ + /*! simple tokenizer that produces a string stream */ + class StringStream : public Stream + { + public: + StringStream(const Ref >& cin, const std::string& seps = "\n\t\r ", + const std::string& endl = "", bool multiLine = false); + public: + ParseLocation location() { return cin->loc(); } + std::string next(); + private: + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; } + private: + Ref > cin; /*! source character stream */ + bool isSepMap[256]; /*! map for fast classification of separators */ + bool isValidCharMap[256]; /*! map for valid characters */ + std::string endl; /*! the token of the end of line */ + bool multiLine; /*! 
whether to parse lines wrapped with \ */ + }; +} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp new file mode 100644 index 00000000000..d05be65862f --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp @@ -0,0 +1,181 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenstream.h" +#include "../math/math.h" + +namespace embree +{ + /* shorthands for common sets of characters */ + const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz"; + const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const std::string TokenStream::numbers = "0123456789"; + const std::string TokenStream::separators = "\n\t\r "; + const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\"; + + /* creates map for fast categorization of characters */ + static void createCharMap(bool map[256], const std::string& chrs) { + for (size_t i=0; i<256; i++) map[i] = false; + for (size_t i=0; i >& cin, //< stream to read from + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector& symbols) //< symbols + : cin(cin), symbols(symbols) + { + createCharMap(isAlphaMap,alpha); + createCharMap(isSepMap,seps); + createCharMap(isStringCharMap,stringChars); + } + + bool TokenStream::decDigits(std::string& str_o) + { + bool ok = false; + std::string str; + if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::decDigits1(std::string& str_o) + { + bool ok = false; + std::string str; + while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); } + if (ok) str_o += str; else cin->unget(str.size()); + 
return ok; + } + + bool TokenStream::trySymbol(const std::string& symbol) + { + size_t pos = 0; + while (pos < symbol.size()) { + if (symbol[pos] != cin->peek()) { cin->unget(pos); return false; } + cin->drop(); pos++; + } + return true; + } + + bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) + { + for (size_t i=0; ipeek() == '.') { + str += (char)cin->get(); + decDigits(str); + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1.[2]E2 + } + else ok = true; // 1.[2] + } + else if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // 1E2 + } + } + else + { + if (cin->peek() == '.') { + str += (char)cin->get(); + if (decDigits(str)) { + if (cin->peek() == 'e' || cin->peek() == 'E') { + str += (char)cin->get(); + if (decDigits(str)) ok = true; // .3E2 + } + else ok = true; // .3 + } + } + } + if (ok) { + token = Token((float)atof(str.c_str()),loc); + } + else cin->unget(str.size()); + return ok; + } + + bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { + std::string str; + if (decDigits(str)) { + token = Token(atoi(str.c_str()),loc); + return true; + } + return false; + } + + bool TokenStream::tryString(Token& token, const ParseLocation& loc) + { + std::string str; + if (cin->peek() != '\"') return false; + cin->drop(); + while (cin->peek() != '\"') { + const int c = cin->get(); + if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); + str += (char)c; + } + cin->drop(); + token = Token(str,Token::TY_STRING,loc); + return true; + } + + bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) + { + std::string str; + if (!isAlpha(cin->peek())) return false; + str += (char)cin->get(); + while (isAlphaNum(cin->peek())) str += (char)cin->get(); + token = Token(str,Token::TY_IDENTIFIER,loc); + return true; + } + + void TokenStream::skipSeparators() + { + /* 
skip separators */ + while (cin->peek() != EOF && isSeparator(cin->peek())) + cin->drop(); + } + + Token TokenStream::next() + { + Token token; + skipSeparators(); + ParseLocation loc = cin->loc(); + if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ + if (tryFloat (token,loc)) return token; /**< try to parse float */ + if (tryInt (token,loc)) return token; /**< try to parse integer */ + if (tryString (token,loc)) return token; /**< try to parse string */ + if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ + if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ + return Token((char)cin->get(),loc); /**< return invalid character token */ + } +} diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h new file mode 100644 index 00000000000..72a7b4f2f33 --- /dev/null +++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.h @@ -0,0 +1,164 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "stream.h" +#include +#include + +namespace embree +{ + /*! 
token class */ + class Token + { + public: + + enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL }; + + Token ( const ParseLocation& loc = ParseLocation()) : ty(TY_EOF ), loc(loc) {} + Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {} + Token (int i, const ParseLocation& loc = ParseLocation()) : ty(TY_INT ), i(i), loc(loc) {} + Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {} + Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty), str(str), loc(loc) {} + + static Token Eof() { return Token(); } + static Token Sym(std::string str) { return Token(str,TY_SYMBOL); } + static Token Str(std::string str) { return Token(str,TY_STRING); } + static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); } + + char Char() const { + if (ty == TY_CHAR) return c; + THROW_RUNTIME_ERROR(loc.str()+": character expected"); + } + + int Int() const { + if (ty == TY_INT) return i; + THROW_RUNTIME_ERROR(loc.str()+": integer expected"); + } + + float Float(bool cast = true) const { + if (ty == TY_FLOAT) return f; + if (ty == TY_INT && cast) return (float)i; + THROW_RUNTIME_ERROR(loc.str()+": float expected"); + } + + std::string Identifier() const { + if (ty == TY_IDENTIFIER) return str; + THROW_RUNTIME_ERROR(loc.str()+": identifier expected"); + } + + std::string String() const { + if (ty == TY_STRING) return str; + THROW_RUNTIME_ERROR(loc.str()+": string expected"); + } + + std::string Symbol() const { + if (ty == TY_SYMBOL) return str; + THROW_RUNTIME_ERROR(loc.str()+": symbol expected"); + } + + const ParseLocation& Location() const { return loc; } + + friend bool operator==(const Token& a, const Token& b) + { + if (a.ty != b.ty) return false; + if (a.ty == TY_CHAR) return a.c == b.c; + if (a.ty == TY_INT) return a.i == b.i; + if (a.ty == TY_FLOAT) return a.f == b.f; + if (a.ty == TY_IDENTIFIER) return a.str == b.str; + if 
(a.ty == TY_STRING) return a.str == b.str; + if (a.ty == TY_SYMBOL) return a.str == b.str; + return true; + } + + friend bool operator!=(const Token& a, const Token& b) { + return !(a == b); + } + + friend bool operator <( const Token& a, const Token& b ) { + if (a.ty != b.ty) return (int)a.ty < (int)b.ty; + if (a.ty == TY_CHAR) return a.c < b.c; + if (a.ty == TY_INT) return a.i < b.i; + if (a.ty == TY_FLOAT) return a.f < b.f; + if (a.ty == TY_IDENTIFIER) return a.str < b.str; + if (a.ty == TY_STRING) return a.str < b.str; + if (a.ty == TY_SYMBOL) return a.str < b.str; + return false; + } + + friend std::ostream& operator<<(std::ostream& cout, const Token& t) + { + if (t.ty == TY_EOF) return cout << "eof"; + if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")"; + if (t.ty == TY_INT) return cout << "Int(" << t.i << ")"; + if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")"; + if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")"; + if (t.ty == TY_STRING) return cout << "String(" << t.str << ")"; + if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")"; + return cout << "unknown"; + } + + private: + Type ty; //< the type of the token + union { + char c; //< data for char tokens + int i; //< data for int tokens + float f; //< data for float tokens + }; + std::string str; //< data for string and identifier tokens + ParseLocation loc; //< the location the token is from + }; + + /*! build full tokenizer that takes list of valid characters and keywords */ + class TokenStream : public Stream + { + public: + + /*! 
shorthands for common sets of characters */ + static const std::string alpha; + static const std::string ALPHA; + static const std::string numbers; + static const std::string separators; + static const std::string stringChars; + + public: + TokenStream(const Ref >& cin, + const std::string& alpha, //< valid characters for identifiers + const std::string& seps, //< characters that act as separators + const std::vector& symbols = std::vector()); //< symbols + public: + ParseLocation location() { return cin->loc(); } + Token next(); + bool trySymbol(const std::string& symbol); + + private: + void skipSeparators(); + bool decDigits(std::string& str); + bool decDigits1(std::string& str); + bool trySymbols(Token& token, const ParseLocation& loc); + bool tryFloat(Token& token, const ParseLocation& loc); + bool tryInt(Token& token, const ParseLocation& loc); + bool tryString(Token& token, const ParseLocation& loc); + bool tryIdentifier(Token& token, const ParseLocation& loc); + + Ref > cin; + bool isSepMap[256]; + bool isAlphaMap[256]; + bool isStringCharMap[256]; + std::vector symbols; + + /*! checks if a character is a separator */ + __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; } + + /*! checks if a character is a number */ + __forceinline bool isDigit(unsigned int c) const { return c >= '0' && c <= '9'; } + + /*! checks if a character is valid inside a string */ + __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; } + + /*! 
checks if a character is legal for an identifier */ + __forceinline bool isAlpha(unsigned int c) const { return c<256 && isAlphaMap[c]; } + __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); } + }; +} diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h new file mode 100644 index 00000000000..e8698ac56d1 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/AVX2NEON.h @@ -0,0 +1,986 @@ +#pragma once + +#include "SSE2NEON.h" + + +#define AVX2NEON_ABI static inline __attribute__((always_inline)) + + +struct __m256d; + +struct __m256 { + __m128 lo,hi; + __m256() {} +}; + + + + +struct __m256i { + __m128i lo,hi; + explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} + operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} + __m256i() {} +}; + + + + +struct __m256d { + float64x2_t lo,hi; + __m256d() {} + __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} + __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} +}; + +#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} + + +#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} +#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} + +#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} + + +#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo 
= basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} + + + +#define _mm_stream_load_si128 _mm_load_si128 +#define _mm256_stream_load_si256 _mm256_load_si256 + + +AVX2NEON_ABI +__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) +{ + __m128 res; + for (int i=0;i<4;i++) + { + if (imm8 & (1< +AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) +{ + float v; + v = 0; + v += (code & 0x10) ? a[0]*b[0] : 0; + v += (code & 0x20) ? a[1]*b[1] : 0; + v += (code & 0x40) ? a[2]*b[2] : 0; + v += (code & 0x80) ? a[3]*b[3] : 0; + float32x4_t res; + res[0] = (code & 0x1) ? v : 0; + res[1] = (code & 0x2) ? v : 0; + res[2] = (code & 0x4) ? v : 0; + res[3] = (code & 0x8) ? v : 0; + return res; +} + +template<> +inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + m[3] = 0; + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +template<> +inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +#define _mm_dp_ps(a,b,c) dpps_neon((a),(b)) + + + +AVX2NEON_ABI +__m128 _mm_cmpnge_ps (__m128 a, __m128 b) +{ + return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b)))); +} + + +AVX2NEON_ABI +__m128 _mm_permutevar_ps (__m128 a, __m128i b) +{ + __m128 x; + for (int i=0;i<4;i++) + { + x[i] = a[b[i&3]]; + } + return x; +} + +AVX2NEON_ABI +__m256i _mm256_setzero_si256() +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_setzero_ps() +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(0.0f); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_undefined_si256() +{ + return _mm256_setzero_si256(); +} + +AVX2NEON_ABI +__m256 _mm256_undefined_ps() +{ + return _mm256_setzero_ps(); +} + +CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t) +CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i) +CAST_SIMD_TYPE(__m256, 
_mm256_castsi256_ps, __m256i,__m128) +CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128) +CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t) +CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i) + + + + +AVX2NEON_ABI +__m128 _mm256_castps256_ps128 (__m256 a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256i _mm256_castsi128_si256 (__m128i a) +{ + __m256i res; + res.lo = a ; + res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m128i _mm256_castsi256_si128 (__m256i a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_castps128_ps256 (__m128 a) +{ + __m256 res; + res.lo = a; + res.hi = vdupq_n_f32(0); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_broadcast_ss (float const * mem_addr) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(*mem_addr); + return res; +} + + + +AVX2NEON_ABI +__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_set1_epi32 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(a); + return res; +} + + + + +AVX2NEON_ABI +int _mm256_movemask_ps(const __m256& v) +{ + return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); +} + +template +AVX2NEON_ABI +__m256 __mm256_permute_ps (const __m256& a) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); + return res; + +} + +#define _mm256_permute_ps(a,c) __mm256_permute_ps(a) + + +template +AVX2NEON_ABI +__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); + return res; + +} + +#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps(a,b) + +AVX2NEON_ABI +__m256i _mm256_set1_epi64x (long long a) +{ + __m256i res; + int64x2_t t = vdupq_n_s64(a); + res.lo = res.hi = __m128i(t); + return res; +} + + +AVX2NEON_ABI +__m256 
_mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) +{ + __m256 res; + __m128 tmp; + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + + + res.lo = tmp; + imm8 >>= 4; + + switch (imm8 & 0x7) + { + case 0: tmp = a.lo; break; + case 1: tmp = a.hi; break; + case 2: tmp = b.lo; break; + case 3: tmp = b.hi; break; + } + if (imm8 & 0x8) + tmp = _mm_setzero_ps(); + + res.hi = tmp; + + return res; +} + +AVX2NEON_ABI +__m256 _mm256_moveldup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[0]; + res.lo[2] = res.lo[3] = a.lo[2]; + res.hi[0] = res.hi[1] = a.hi[0]; + res.hi[2] = res.hi[3] = a.hi[2]; + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_movehdup_ps (__m256 a) +{ + __m256 res; + res.lo[0] = res.lo[1] = a.lo[1]; + res.lo[2] = res.lo[3] = a.lo[3]; + res.hi[0] = res.hi[1] = a.hi[1]; + res.hi[2] = res.hi[3] = a.hi[3]; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) +{ + __m256 res = a; + if (imm8 & 1) res.hi = b; + else res.lo = b; + return res; +} + + +AVX2NEON_ABI +__m128 _mm256_extractf128_ps (__m256 a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + + +AVX2NEON_ABI +__m256d _mm256_movedup_pd (__m256d a) +{ + __m256d res; + res.hi = a.hi; + res.lo[0] = res.lo[1] = a.lo[0]; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_abs_epi32(__m256i a) +{ + __m256i res; + res.lo = vabsq_s32(a.lo); + res.hi = vabsq_s32(a.hi); + return res; +} + +UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps) +UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps) +UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32) +UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32) + + +BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32) +BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32) +BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32) + 
+BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32) +BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32) +BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t) +BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) + +BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) +BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) + +BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) +BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) +BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) +BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) + +BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) +BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) +BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) +BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) + +BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) + + + +BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) +BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) +BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) + + +BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) +BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) +TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) + + +TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) + + +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) + + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) +BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) 
+BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) +BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) +BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) +BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) + + +AVX2NEON_ABI +__m256i _mm256_cvtps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvtps_epi32(a.lo); + res.hi = _mm_cvtps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_cvttps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvttps_epi32(a.lo); + res.hi = _mm_cvttps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_loadu_ps (float const * mem_addr) +{ + __m256 res; + res.lo = *(__m128 *)(mem_addr + 0); + res.hi = *(__m128 *)(mem_addr + 4); + return res; +} +#define _mm256_load_ps _mm256_loadu_ps + + +AVX2NEON_ABI +int _mm256_testz_ps (const __m256& a, const __m256& b) +{ + __m256 t = a; + if (&a != &b) + t = _mm256_and_ps(a,b); + + __m128i l = vshrq_n_s32(__m128i(t.lo),31); + __m128i h = vshrq_n_s32(__m128i(t.hi),31); + return vaddvq_s32(vaddq_s32(l,h)) == 0; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) +{ + __m256i res; + int64x2_t t0 = {e0,e1}; + int64x2_t t1 = {e2,e3}; + res.lo = __m128i(t0); + res.hi = __m128i(t1); + return res; +} + +AVX2NEON_ABI +__m256d _mm256_setzero_pd () +{ + __m256d res; + res.lo = res.hi = vdupq_n_f64(0); + return res; +} + +AVX2NEON_ABI +int _mm256_movemask_pd (__m256d a) +{ + int res = 0; + uint64x2_t x; + x = uint64x2_t(a.lo); + res |= (x[0] >> 63) ? 1 : 0; + res |= (x[0] >> 63) ? 2 : 0; + x = uint64x2_t(a.hi); + res |= (x[0] >> 63) ? 4 : 0; + res |= (x[0] >> 63) ? 
8 : 0; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) +{ + __m256i res; + res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo))); + res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_pd (__m256d a, __m256d b) +{ + __m256i res; + res.lo = __m128i(vceqq_f64(a.lo,b.lo)); + res.hi = __m128i(vceqq_f64(a.hi,b.hi)); + return res; +} + + +AVX2NEON_ABI +int _mm256_testz_pd (const __m256d& a, const __m256d& b) +{ + __m256d t = a; + + if (&a != &b) + t = _mm256_and_pd(a,b); + + return _mm256_movemask_pd(t) == 0; +} + +AVX2NEON_ABI +__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) +{ + __m256d res; + uint64x2_t t = uint64x2_t(mask.lo); + res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0]; + res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1]; + t = uint64x2_t(mask.hi); + res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0]; + res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1]; + return res; +} + +template +__m256 __mm256_dp_ps (__m256 a, __m256 b) +{ + __m256 res; + res.lo = _mm_dp_ps(a.lo,b.lo,imm8); + res.hi = _mm_dp_ps(a.hi,b.hi,imm8); + return res; +} + +#define _mm256_dp_ps(a,b,c) __mm256_dp_ps(a,b) + +AVX2NEON_ABI +double _mm256_permute4x64_pd_select(__m256d a, const int imm8) +{ + switch (imm8 & 3) { + case 0: + return a.lo[0]; + case 1: + return a.lo[1]; + case 2: + return a.hi[0]; + case 3: + return a.hi[1]; + } + __builtin_unreachable(); + return 0; +} + +AVX2NEON_ABI +__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) +{ + __m256d res; + res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); + res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); + res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); + res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); + + return res; +} + +AVX2NEON_ABI +__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) +{ + return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); +} + + +AVX2NEON_ABI +__m256i 
_mm256_loadu_si256 (__m256i const * mem_addr) +{ + __m256i res; + res.lo = *(__m128i *)((int32_t *)mem_addr + 0); + res.hi = *(__m128i *)((int32_t *)mem_addr + 4); + return res; +} + +#define _mm256_load_si256 _mm256_loadu_si256 + +AVX2NEON_ABI +void _mm256_storeu_ps (float * mem_addr, __m256 a) +{ + *(__m128 *)(mem_addr + 0) = a.lo; + *(__m128 *)(mem_addr + 4) = a.hi; + +} + +#define _mm256_store_ps _mm256_storeu_ps +#define _mm256_stream_ps _mm256_storeu_ps + + +AVX2NEON_ABI +void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) +{ + *(__m128i *)((int *)mem_addr + 0) = a.lo; + *(__m128i *)((int *)mem_addr + 4) = a.hi; + +} + +#define _mm256_store_si256 _mm256_storeu_si256 + + + +AVX2NEON_ABI +__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) +{ + __m256 res; + res.lo = _mm_maskload_ps(mem_addr,mask.lo); + res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu8_epi32 (__m128i a) +{ + __m256i res; + uint8x16_t x = uint8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepi8_epi32 (__m128i a) +{ + __m256i res; + int8x16_t x = int8x16_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu16_epi32 (__m128i a) +{ + __m256i res; + uint16x8_t x = uint16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cvtepi16_epi32 (__m128i a) +{ + __m256i res; + int16x8_t x = int16x8_t(a); + for (int i=0;i<4;i++) + { + res.lo[i] = x[i]; + res.hi[i] = x[i+4]; + } + return res; +} + + + +AVX2NEON_ABI +void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +{ + _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); + _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); +} + +AVX2NEON_ABI +__m256i _mm256_slli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = 
_mm_slli_epi32(a.lo,imm8); + res.hi = _mm_slli_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_srli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srli_epi32(a.lo,imm8); + res.hi = _mm_srli_epi32(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_srai_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srai_epi32(a.lo,imm8); + res.hi = _mm_srai_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,count.lo); + res.hi = vshlq_s32(a.hi,count.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_srav_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); + res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); + res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) +{ + return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); +} + + +AVX2NEON_ABI +__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_set1_ps(float x) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(x); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m256 res; + res.lo = _mm_set_ps(e3,e2,e1,e0); + res.hi = _mm_set_ps(e7,e6,e5,e4); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_broadcast_ps (__m128 const * mem_addr) +{ + __m256 res; + res.lo = res.hi = *mem_addr; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_cvtepi32_ps (__m256i a) +{ + __m256 res; + res.lo = _mm_cvtepi32_ps(a.lo); + res.hi = _mm_cvtepi32_ps(a.hi); + return res; +} 
+AVX2NEON_ABI +void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) +{ + for (int i=0;i<4;i++) { + if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i]; + if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i]; + } +} + +AVX2NEON_ABI +__m256d _mm256_andnot_pd (__m256d a, __m256d b) +{ + __m256d res; + res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); + res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) +{ + __m256 res; + res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) +{ + __m256i res; + res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +{ + __m256i res; + for (int i=0;i<4;i++) + { + res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); + } + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +{ + __m256i res = _mm256_setzero_si256(); + for (int i=0;i<4;i++) + { + if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale)); + if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale)); + } + + return res; + +} + + diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h new file mode 100644 index 00000000000..2013151d31f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/SSE2NEON.h @@ -0,0 +1,1753 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to 
their corresponding ARM NEON versions +// +// This header file does not (yet) translate *all* of the SSE intrinsics. +// Since this is in support of a specific porting effort, I have only +// included the intrinsics I needed to get my port to work. +// +// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com +// +// If you want to improve or add to this project, send me an +// email and I will probably approve your access to the depot. +// +// Project is located here: +// +// https://github.com/jratcliff63367/sse2neon +// +// Show your appreciation for open source by sending me a bitcoin tip to the following +// address. +// +// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p : +// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p +// +// +// Contributors to this project are: +// +// John W. Ratcliff : jratcliffscarab@gmail.com +// Brandon Rowlett : browlett@nvidia.com +// Ken Fast : kfast@gdeb.com +// Eric van Beurden : evanbeurden@nvidia.com +// +// +// ********************************************************************************************************************* +// Release notes for January 20, 2017 version: +// +// The unit tests have been refactored. They no longer assert on an error, instead they return a pass/fail condition +// The unit-tests now test 10,000 random float and int values against each intrinsic. +// +// SSE2NEON now supports 95 SSE intrinsics. 39 of them have formal unit tests which have been implemented and +// fully tested on NEON/ARM. The remaining 56 still need unit tests implemented. +// +// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which +// attempt to access the contents of an _m128 struct directly. 
It is important to note that accessing the __m128 +// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer +// can use the SIMDVec as an alias for it. Any casting must be done manually by the developer, as you cannot +// cast or otherwise alias the base NEON data type for intrinsic operations. +// +// A bug was found with the _mm_shuffle_ps intrinsic. If the shuffle permutation was not one of the ones with +// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing +// to return the correct value. This is now fixed. +// +// A bug was found with the _mm_cvtps_epi32 intrinsic. This converts floating point values to integers. +// It was not honoring the correct rounding mode. In SSE the default rounding mode when converting from float to int +// is to use 'round to even' otherwise known as 'bankers rounding'. ARMv7 did not support this feature but ARMv8 does. +// As it stands today, this header file assumes ARMv8. If you are trying to target really old ARM devices, you may get +// a build error. +// +// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are +// producing the correct results on NEON. These unit tests will be added as soon as possible. +// +// Here is the list of new instrinsics which have been added: +// +// _mm_cvtss_f32 : extracts the lower order floating point value from the parameter +// _mm_add_ss : adds the scalar single - precision floating point values of a and b +// _mm_div_ps : Divides the four single - precision, floating - point values of a and b. +// _mm_div_ss : Divides the scalar single - precision floating point value of a by b. 
+// _mm_sqrt_ss : Computes the approximation of the square root of the scalar single - precision floating point value of in. +// _mm_rsqrt_ps : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in. +// _mm_comilt_ss : Compares the lower single - precision floating point scalar values of a and b using a less than operation +// _mm_comigt_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than operation. +// _mm_comile_ss : Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation. +// _mm_comige_ss : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation. +// _mm_comieq_ss : Compares the lower single - precision floating point scalar values of a and b using an equality operation. +// _mm_comineq_s : Compares the lower single - precision floating point scalar values of a and b using an inequality operation +// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b. +// _mm_unpackhi_epi16: Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b. 
+// +// ********************************************************************************************************************* +/* +** The MIT license: +** +** Permission is hereby granted, free of charge, to any person obtaining a copy +** of this software and associated documentation files (the "Software"), to deal +** in the Software without restriction, including without limitation the rights +** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +** copies of the Software, and to permit persons to whom the Software is furnished +** to do so, subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. + +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#pragma once + +#define GCC 1 +#define ENABLE_CPP_VERSION 0 + +// enable precise emulation of _mm_min_ps and _mm_max_ps? +// This would slow down the computation a bit, but gives consistent result with x86 SSE2. +// (e.g. 
would solve a hole or NaN pixel in the rendering result) +#define USE_PRECISE_MINMAX_IMPLEMENTATION (1) + +#if GCC +#define FORCE_INLINE inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#define FORCE_INLINE inline +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +#include +#include "arm_neon.h" +#if defined(__aarch64__) +#include "constants.h" +#endif + + +#if !defined(__has_builtin) +#define __has_builtin(x) (0) +#endif + +/*******************************************************/ +/* MACRO for shuffle parameter for _mm_shuffle_ps(). */ +/* Argument fp3 is a digit[0123] that represents the fp*/ +/* from argument "b" of mm_shuffle_ps that will be */ +/* placed in fp3 of result. fp2 is the same for fp2 in */ +/* result. fp1 is a digit[0123] that represents the fp */ +/* from argument "a" of mm_shuffle_ps that will be */ +/* places in fp1 of result. fp0 is the same for fp0 of */ +/* result */ +/*******************************************************/ +#if defined(__aarch64__) +#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) +#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } ) +#endif + +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ + ((fp1) << 2) | ((fp0))) + +typedef float32x4_t __m128; +typedef int32x4_t __m128i; + +// union intended to allow direct access to an __m128 variable using the names that the MSVC +// compiler provides. 
This union should really only be used when trying to access the members +// of the vector as integer values. GCC/clang allow native access to the float members through +// a simple array access operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance +// hit. If it really is needed however, the original __m128 variable can be aliased with a +// pointer to this union and used to access individual components. The use of this union should +// be hidden behind a macro that is used throughout the codebase to access the members instead +// of always declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec +{ + float m128_f32[4]; // as floats - do not to use this. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+ double m128_f64[2]; // as signed double +} SIMDVec; + +// ****************************************** +// CPU stuff +// ****************************************** + +typedef SIMDVec __m128d; + +#include + +#ifndef _MM_MASK_MASK +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_DIV_ZERO 0x200 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_DENORMALS_ZERO_ON 0x40 +#define _MM_MASK_DENORM 0x100 +#endif +#define _MM_SET_EXCEPTION_MASK(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) +#define _MM_SET_DENORMALS_ZERO_MODE(x) + +FORCE_INLINE void _mm_pause() +{ +} + +FORCE_INLINE void _mm_mfence() +{ + __sync_synchronize(); +} + +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 +#define _MM_HINT_NTA 0 + +FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level) +{ + __builtin_prefetch(ptr); + +} + +FORCE_INLINE void* _mm_malloc(int size, int align) +{ + void *ptr; + // align must be multiple of sizeof(void *) for posix_memalign. + if (align < sizeof(void *)) { + align = sizeof(void *); + } + + if ((align % sizeof(void *)) != 0) { + // fallback to malloc + ptr = malloc(size); + } else { + if (posix_memalign(&ptr, align, size)) { + return 0; + } + } + + return ptr; +} + +FORCE_INLINE void _mm_free(void* ptr) +{ + free(ptr); +} + +FORCE_INLINE int _mm_getcsr() +{ + return 0; +} + +FORCE_INLINE void _mm_setcsr(int val) +{ + return; +} + +// ****************************************** +// Set/get methods +// ****************************************** + +// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396 +#if defined(__aarch64__) +FORCE_INLINE float _mm_cvtss_f32(const __m128& x) +{ + return x[0]; +} +#else +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(a, 0); +} +#endif + +// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128() +{ + 
return vdupq_n_s32(0); +} + +// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vdupq_n_f32(0); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vdupq_n_f32(_w); +} + +// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x) +{ + float32x4_t t = { x, y, z, w }; + return t; +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x ) +{ + float32x4_t t = { w, z, y, x }; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float __attribute__((aligned(16))) data[4] = { x, y, z, w }; + return vld1q_f32(data); +} + +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x ) +{ + float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; + return vld1q_f32(data); +} +#endif + +// Sets the 4 signed 32-bit integer values to i. 
https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vdupq_n_s32(_i); +} + +//Set the first lane to of 4 signed single-position, floating-point number to w +#if defined(__aarch64__) +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + float32x4_t res = {_w, 0, 0, 0}; + return res; +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32x4_t t = {i0,i1,i2,i3}; + return t; +} +#else +FORCE_INLINE __m128 _mm_set_ss(float _w) +{ + __m128 val = _mm_setzero_ps(); + return vsetq_lane_f32(_w, val, 0); +} + +// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 }; + return vld1q_s32(data); +} +#endif + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, a); +} + +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a ) +{ + vst1q_s32((int32_t*) p,a); +} + +// Stores the lower single - precision, floating - point value. 
https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, a, 0); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b) +{ + *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); +} + +// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float * p) +{ + return vld1q_dup_f32(p); +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float * p) +{ + return vld1q_f32(p); +} + +// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float * p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon + return vld1q_f32(p); +} + +// Loads an single - precision, floating - point value into the low word and clears the upper three words. https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float * p) +{ + __m128 result = vdupq_n_f32(0); + return vsetq_lane_f32(*p, result, 0); +} + +FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p) +{ + return (__m128i)vld1q_s32((const int32_t*) p); +} + + +// ****************************************** +// Logic/Binary operations +// ****************************************** + +// Compares for inequality. 
https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b)); +} + +// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return (__m128i)vandq_s32(a, b); +} + +// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return (__m128)vandq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return (__m128)vorrq_s32((__m128i)a, (__m128i)b); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return (__m128)veorq_s32((__m128i)a, (__m128i)b); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. 
https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return (__m128i)vorrq_s32(a, b); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return veorq_s32(a, b); +} + +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ +#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this + uint32x4_t &ia = *(uint32x4_t *)&a; + return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8); +#else + +#if defined(__aarch64__) + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + return vaddvq_u32(t2); +#else + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = vreinterpretq_u32_f32(a); + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +#endif + +#endif +} + +#if defined(__aarch64__) +FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a) +{ + uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask); + t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2))); + return vaddvq_u32(t2); + +} +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. 
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + return vcombine_f32(vget_high_f32(a), vget_low_f32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high end of result +// takes the higher two 32 bit values from b and swaps them and places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b))); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + return vcombine_f32(vget_low_f32(a), vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(a, 0); + float32_t a2 = vgetq_lane_f32(a, 2); + float32x2_t aVal = vdup_n_f32(a2); + aVal = vset_lane_f32(a0, aVal, 1); + return vcombine_f32(aVal, vget_high_f32(b)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1))); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_low_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + 
float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(b, 0); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t bVal = vdup_n_f32(b0); + bVal = vset_lane_f32(b2, bVal, 1); + return vcombine_f32(vget_high_f32(a), bVal); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3)); + float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a21, b03); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3)); + float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3)); + return vcombine_f32(a03, b21); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(a); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a10, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b10 = vget_low_f32(b); + return vcombine_f32(a01, b10); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(a)); + float32x2_t b01 = vrev64_f32(vget_low_f32(b)); + return vcombine_f32(a01, b01); +} + +// NEON does not support a general purpose permute intrinsic +// Currently I am not sure whether the C implementation is faster or slower than the NEON version. +// Note, this has to be expanded as a template because the shuffle value must be an immediate value. +// The same is true on SSE as well. +// Selects four specific single-precision, floating-point values from a and b, based on the mask i. 
https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +template +FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b) +{ +#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet. + __m128 ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else +# if __has_builtin(__builtin_shufflevector) + return __builtin_shufflevector( \ + a, b, (i) & (0x3), ((i) >> 2) & 0x3, + (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4); +# else + const int i0 = (i >> 0)&0x3; + const int i1 = (i >> 2)&0x3; + const int i2 = (i >> 4)&0x3; + const int i3 = (i >> 6)&0x3; + + if (&a == &b) + { + if (i0 == i1 && i0 == i2 && i0 == i3) + { + return (float32x4_t)vdupq_laneq_f32(a,i0); + } + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3, + (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3 + }; + + return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl); + + } + else + { + + static const uint8_t tbl[16] = { + (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3, + (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3, + (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16, + (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16 + }; + + return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl)); + } +# endif //builtin(shufflevector) +#endif +} + +template +FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): + return _mm_shuffle_ps_1032(a, b); + break; + case _MM_SHUFFLE(2, 3, 0, 1): + return _mm_shuffle_ps_2301(a, b); + break; + case _MM_SHUFFLE(3, 2, 1, 0): + return _mm_shuffle_ps_3210(a, b); + break; + case _MM_SHUFFLE(0, 0, 1, 1): + return _mm_shuffle_ps_0011(a, b); + break; + case 
_MM_SHUFFLE(0, 0, 2, 2): + return _mm_shuffle_ps_0022(a, b); + break; + case _MM_SHUFFLE(2, 2, 0, 0): + return _mm_shuffle_ps_2200(a, b); + break; + case _MM_SHUFFLE(3, 2, 0, 2): + return _mm_shuffle_ps_3202(a, b); + break; + case _MM_SHUFFLE(1, 1, 3, 3): + return _mm_shuffle_ps_1133(a, b); + break; + case _MM_SHUFFLE(2, 0, 1, 0): + return _mm_shuffle_ps_2010(a, b); + break; + case _MM_SHUFFLE(2, 0, 0, 1): + return _mm_shuffle_ps_2001(a, b); + break; + case _MM_SHUFFLE(2, 0, 3, 2): + return _mm_shuffle_ps_2032(a, b); + break; + case _MM_SHUFFLE(0, 3, 2, 1): + return _mm_shuffle_ps_0321(a, b); + break; + case _MM_SHUFFLE(2, 1, 0, 3): + return _mm_shuffle_ps_2103(a, b); + break; + case _MM_SHUFFLE(1, 0, 1, 0): + return _mm_shuffle_ps_1010(a, b); + break; + case _MM_SHUFFLE(1, 0, 0, 1): + return _mm_shuffle_ps_1001(a, b); + break; + case _MM_SHUFFLE(0, 1, 0, 1): + return _mm_shuffle_ps_0101(a, b); + break; + } + return _mm_shuffle_ps_default(a, b); +} + +# if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default(a,b) +# else +#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function(a,b) +#endif + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vget_low_s32(b)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end of result +// takes the higher two 32 bit values from b and swaps them and places in high end of result. 
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b))); +} + +// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b +// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 1); +} + +// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a +// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) +{ + return vextq_s32(a, b, 3); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b) +{ + return vcombine_s32(vget_low_s32(a), vget_low_s32(a)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits +// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) +{ + return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) +{ + return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), 
vrev64_s32(vget_low_s32(b))); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) +{ + return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3))); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) +{ +#if ENABLE_CPP_VERSION + __m128i ret; + ret[0] = a[i & 0x3]; + ret[1] = a[(i >> 2) & 0x3]; + ret[2] = b[(i >> 4) & 0x03]; + ret[3] = b[(i >> 6) & 0x03]; + return ret; +#else + __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3)); + ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2); + ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3); + return ret; +#endif +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) +{ + switch (i) + { + case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break; + case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break; + case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break; + case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break; + case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break; + case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b); break; + case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break; + case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break; + case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break; + case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break; + default: return _mm_shuffle_epi32_default(a, b); + } +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) +{ + return vdupq_n_s32(vgetq_lane_s32(a, i)); +} + +template +FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) +{ + switch (i) + { + case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break; + case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a); break; + case 
_MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break; + case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break; + default: return _mm_shuffle_epi32_function(a, a); + } +} + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single(a) + +template +FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) +{ + int16x8_t ret = (int16x8_t)a; + int16x4_t highBits = vget_high_s16(ret); + ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6); + ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7); + return (__m128i)ret; +} + +// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm. https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function(a) + +// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx +//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const int s = (imm8 > 31) ? 0 : imm8; + data[0] = data[0] << s; + data[1] = data[1] << s; + data[2] = data[2] << s; + data[3] = data[3] << s; + + return vld1q_s32(data); +#endif +} + + +//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros. 
https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx +//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int shift = (imm8 > 31) ? 0 : imm8; // Unfortunately, we need to check for this case for embree. + const int32x4_t s = vdupq_n_s32(-shift); + return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s)); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + + const int s = (imm8 > 31) ? 0 : imm8; + + data[0] = data[0] >> s; + data[1] = data[1] >> s; + data[2] = data[2] >> s; + data[3] = data[3] >> s; + + return vld1q_s32(data); +#endif +} + + +// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit. https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx +//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm) + +// Based on SIMDe +FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8) +{ +#if defined(__aarch64__) + const int32x4_t s = vdupq_n_s32(-imm8); + return vshlq_s32(a, s); +#else + int32_t __attribute__((aligned(16))) data[4]; + vst1q_s32(data, a); + const uint32_t m = (uint32_t) ((~0U) << (32 - imm8)); + + for (int i = 0; i < 4; i++) { + uint32_t is_neg = ((uint32_t) (((data[i]) >> 31))); + data[i] = (data[i] >> imm8) | (m * is_neg); + } + + return vld1q_s32(data); +#endif +} + +// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx +//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm)) +#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) + +// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. 
https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx +#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) + +// NEON does not provide a version of this function, here is an article about some ways to repro the results. +// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon +// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i _a) +{ + uint8x16_t input = (uint8x16_t)_a; + const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 }; + uint8x8_t mask_and = vdup_n_u8(0x80); + int8x8_t mask_shift = vld1_s8(xr); + + uint8x8_t lo = vget_low_u8(input); + uint8x8_t hi = vget_high_u8(input); + + lo = vand_u8(lo, mask_and); + lo = vshl_u8(lo, mask_shift); + + hi = vand_u8(hi, mask_and); + hi = vshl_u8(hi, mask_shift); + + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + lo = vpadd_u8(lo, lo); + + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + hi = vpadd_u8(hi, hi); + + return ((hi[0] << 8) | (lo[0] & 0xFF)); +} + + +// ****************************************** +// Math operations +// ****************************************** + +// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return vsubq_f32(a, b); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. 
https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vsubq_s32(a, b); +} + +// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vaddq_f32(a, b); +} + +// adds the scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + const float32_t b0 = vgetq_lane_f32(b, 0); + float32x4_t value = vdupq_n_f32(0); + + //the upper values in the result must be the remnants of . + value = vsetq_lane_f32(b0, value, 0); + return vaddq_f32(a, value); +} + +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vaddq_s32(a, b); +} + +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. 
https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) +{ + return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b); +} + +// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return vmulq_f32(a, b); +} + +// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ +#if defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),in); + +#endif + // Get an initial estimate of 1/in. + float32x4_t reciprocal = vrecpeq_f32(in); + + // We only return estimated 1/in. + // Newton-Raphon iteration shold be done in the outside of _mm_rcp_ps(). + + // TODO(LTE): We could delete these ifdef? + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal); + return reciprocal; + +} + +FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) +{ + float32x4_t value; + float32x4_t result = in; + + value = _mm_rcp_ps(in); + return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0); +} + +// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if defined(BUILD_IOS) + return vdivq_f32(a,b); +#else + float32x4_t reciprocal = _mm_rcp_ps(b); + + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + + // Add one more round of newton-raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp. 
+	reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
+
+	// Another round for safety
+	reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
+
+
+	return vmulq_f32(a, reciprocal);
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b. https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+	float32x4_t value;
+	float32x4_t result = a;
+	value = _mm_div_ps(a, b);
+	return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
+}
+
+// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in. https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+
+	float32x4_t value = vrsqrteq_f32(in);
+
+	// TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values.
+	// Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html
+	// If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps
+
+	value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
+	value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
+
+	// one more round to get better precision
+	value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
+
+	// another round for safety
+	value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
+
+	return value;
+}
+
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+	float32x4_t result = in;
+
+	__m128 value = _mm_rsqrt_ps(in);
+
+	return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
+}
+
+
+// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if defined(BUILD_IOS)
+	return vsqrtq_f32(in);
+#else
+	__m128 reciprocal = _mm_rsqrt_ps(in);
+
+	// We must treat sqrt(in == 0) in a special way. At this point reciprocal contains garbage due to vrsqrteq_f32(0) returning +inf.
+	// We assign 0 to reciprocal wherever required.
+	const float32x4_t vzero = vdupq_n_f32(0.0f);
+	const uint32x4_t mask = vceqq_f32(in, vzero);
+	reciprocal = vbslq_f32(mask, vzero, reciprocal);
+
+	// sqrt(x) = x * (1 / sqrt(x))
+	return vmulq_f32(in, reciprocal);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision floating point value of in. https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+	float32x4_t value;
+	float32x4_t result = in;
+
+	value = _mm_sqrt_ps(in);
+	return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
+}
+
+
+// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+	return vbslq_f32(vcltq_f32(b,a),a,b);
+#else
+	// Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+	return vmaxq_f32(a, b);
+#endif
+}
+
+// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+	return vbslq_f32(vcltq_f32(a,b),a,b);
+#else
+	// Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+	return vminq_f32(a, b);
+#endif
+}
+
+// Computes the maximum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+	float32x4_t value;
+	float32x4_t result = a;
+
+	value = _mm_max_ps(a, b);
+	return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point values of a and b. https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+	float32x4_t value;
+	float32x4_t result = a;
+
+
+	value = _mm_min_ps(a, b);
+	return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+	return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b);
+}
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b )
+{
+	return vmaxq_s32(a,b);
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b )
+{
+	return vminq_s32(a,b);
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+	int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b); // NOTE(review): vqdmulh is a saturating doubling multiply; result differs from x86 mulhi when a == b == -32768 — confirm acceptable for callers
+	ret = vshrq_n_s16(ret, 1);
+	return (__m128i)ret;
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point values a and b. 
+//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b ) +{ +#if defined(__aarch64__) + return vpaddq_f32(a,b); +#else +// This does not work, no vpaddq... +// return (__m128) vpaddq_f32(a,b); + // + // get two f32x2_t values from a + // do vpadd + // put result in low half of f32x4 result + // + // get two f32x2_t values from b + // do vpadd + // put result in high half of f32x4 result + // + // combine + return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) ); +#endif +} + +// ****************************************** +// Compare operations +// ****************************************** + +// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return (__m128)vcltq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b)); +} + +// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return (__m128)vcgtq_f32(a, b); +} + +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return (__m128) _mm_cmpgt_ps(a,b); +} + + +// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return (__m128)vcgeq_f32(a, b); +} + +// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return (__m128)vcleq_f32(a, b); +} + +// Compares for equality. 
https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return (__m128)vceqq_f32(a, b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcltq_s32(a, b); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return (__m128i) vceqq_s32(a,b); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcgtq_s32(a, b); +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx +// see also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b ) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) ); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcltq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. 
: https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgtq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcleq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vcgeq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return vgetq_lane_u32(value, 0); +} + +// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + uint32x4_t value; + + value = vceqq_f32(a, b); + return !vgetq_lane_u32(value, 0); +} + +// according to the documentation, these intrinsics behave the same as the non-'u' versions. We'll just alias them here. 
+#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// ****************************************** +// Conversions +// ****************************************** + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vcvtq_s32_f32(a); +} + +// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vcvtq_f32_s32(a); +} + +// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support! +// It is supported on ARMv8 however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if 1 + return vcvtnq_s32_f32(a); +#else + __m128 half = vdupq_n_f32(0.5f); + const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31))); + const __m128 aPlusHalf = vaddq_f32(a, half); + const __m128 aRound = vsubq_f32(aPlusHalf, sign); + return vcvtq_s32_f32(aRound); +#endif +} + +// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(a, 0); +} + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. 
https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = vdupq_n_s32(0); + return vsetq_lane_s32(a, result, 0); +} + + +// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ +#if defined(__aarch64__) + return (__m128i)a; +#else + return *(const __m128i *)&a; +#endif +} + +// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ +#if defined(__aarch64__) + return (__m128)a; +#else + return *(const __m128 *)&a; +#endif +} + +// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vld1q_s32((int32_t *)p); +} + +FORCE_INLINE __m128d _mm_castps_pd(const __m128 a) +{ + return *(const __m128d *)&a; +} + +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ + return *(const __m128d *)&a; +} +// ****************************************** +// Miscellaneous Operations +// ****************************************** + +// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b)); +} + +// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. 
https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); +} + +// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); +} + +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_low_s16((int16x8_t)a); + int16x4_t b1 = vget_low_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b. https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_low_s32(a); + int32x2_t b1 = vget_low_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Selects and interleaves the lower two single-precision, floating-point values from a and b. 
https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ + float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); + return vcombine_f32(result.val[0], result.val[1]); +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ + int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a); + int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b); + + int8x8x2_t result = vzip_s8(a1, b1); + + return (__m128i)vcombine_s8(result.val[0], result.val[1]); +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ + int16x4_t a1 = vget_high_s16((int16x8_t)a); + int16x4_t b1 = vget_high_s16((int16x8_t)b); + + int16x4x2_t result = vzip_s16(a1, b1); + + return (__m128i)vcombine_s16(result.val[0], result.val[1]); +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b. 
https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ + int32x2_t a1 = vget_high_s32(a); + int32x2_t b1 = vget_high_s32(b); + + int32x2x2_t result = vzip_s32(a1, b1); + + return vcombine_s32(result.val[0], result.val[1]); +} + +// Extracts the selected signed or unsigned 16-bit integer from a and zero extends. https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm) + +// ****************************************** +// Streaming Extensions +// ****************************************** + +// Guarantees that every preceding store is globally visible before any subsequent store. https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned. https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ + *p = a; +} + +// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const*p) +{ + // no corollary for Neon? +} + +FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b) +{ + // Stick to the flipped behavior of x86. 
+	int64_t __attribute__((aligned(16))) data[2] = { b, a };
+	return (__m128i)vld1q_s64(data);
+}
+
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+	return (__m128i)vmovq_n_s64(_i);
+}
+
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
+{
+	int32x4_t mask = vshrq_n_s32(__m128i(c),31);
+	return vbslq_f32( uint32x4_t(mask), b, a);
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr)
+{
+	uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
+	uint16x8_t t1 = vmovl_u8(t0);
+	uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
+	return vreinterpretq_s32_u32(t2);
+}
+
+FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr)
+{
+	uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
+	uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
+	return vreinterpretq_s32_u32(t1);
+}
+
+FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr)
+{
+	int8x8_t t0 = vld1_s8((int8_t*)ptr);
+	int16x8_t t1 = vmovl_s8(t0);
+	int32x4_t t2 = vmovl_s16(vget_low_s16(t1));
+	float32x4_t t3 = vcvtq_f32_s32(t2);
+	return vreinterpretq_s32_f32(t3);
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr)
+{
+	uint8x8_t t0 = vld1_u8((uint8_t*)ptr);
+	uint16x8_t t1 = vmovl_u8(t0);
+	uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); // NOTE(review): no int->float conversion here despite the _f32 name (cf. _mm_load4epi8_f32 above) — confirm intended
+	return vreinterpretq_s32_u32(t2);
+}
+
+FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr)
+{
+	int16x8_t t0 = vld1q_s16((int16_t*)ptr);
+	int32x4_t t1 = vmovl_s16(vget_low_s16(t0));
+	float32x4_t t2 = vcvtq_f32_s32(t1);
+	return vreinterpretq_s32_f32(t2);
+}
+
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+	return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); // NOTE(review): this packs s16->u8 exactly like _mm_packus_epi16; x86 _mm_packus_epi32 packs s32->u16 (vqmovun_s32) — confirm callers expect this behavior
+}
+
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr)
+{
+	// No non-temporal load on a single register on ARM.
+	return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr));
+}
+
+FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a)
+{
+	// No non-temporal store on a single register on ARM.
+ vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a)); +} + +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b))); +} + +FORCE_INLINE __m128 _mm_abs_ps(__m128 a) +{ + return vabsq_f32(a); +} + +FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlaq_f32(c, a, b); +} + +FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c) +{ + return vmlsq_f32(c, a, b); +} + +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vabsq_s32(a); +} +#endif //defined(__aarch64__) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ + return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +} + +#endif diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h new file mode 100644 index 00000000000..32452fbe72b --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/affinespace.h @@ -0,0 +1,361 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linearspace2.h" +#include "linearspace3.h" +#include "quaternion.h" +#include "bbox.h" +#include "vec4.h" + +namespace embree +{ + #define VectorT typename L::Vector + #define ScalarT typename L::Vector::Scalar + + //////////////////////////////////////////////////////////////////////////////// + // Affine Space + //////////////////////////////////////////////////////////////////////////////// + + template + struct AffineSpaceT + { + L l; /*< linear part of affine space */ + VectorT p; /*< affine part of affine space */ + + //////////////////////////////////////////////////////////////////////////////// + // Constructors, Assignment, Cast, Copy Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT ( ) { } + __forceinline AffineSpaceT ( const AffineSpaceT& other ) { l = other.l; p = other.p; } + __forceinline AffineSpaceT ( const L & other ) { l = other ; p = VectorT(zero); } + __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; } + + __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {} + __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {} + + template __forceinline AffineSpaceT( const AffineSpaceT& s ) : l(s.l), p(s.p) {} + + 
//////////////////////////////////////////////////////////////////////////////// + // Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {} + __forceinline AffineSpaceT( OneTy ) : l(one), p(zero) {} + + /*! return matrix for scaling */ + static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); } + + /*! return matrix for translation */ + static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); } + + /*! return matrix for rotation, only in 2D */ + static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); } + + /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ + static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); } + + /*! return matrix for rotation around arbitrary axis and point, only in 3D */ + static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p); } + + /*! 
return matrix for looking at given point, only in 3D */ + static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) { + VectorT Z = normalize(point-eye); + VectorT U = normalize(cross(up,Z)); + VectorT V = normalize(cross(Z,U)); + return AffineSpaceT(L(U,V,Z),eye); + } + + }; + + // template specialization to get correct identity matrix for type AffineSpace3fa + template<> + __forceinline AffineSpaceT::AffineSpaceT( OneTy ) : l(one), p(0.f, 0.f, 0.f, 1.f) {} + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline AffineSpaceT operator -( const AffineSpaceT& a ) { return AffineSpaceT(-a.l,-a.p); } + template __forceinline AffineSpaceT operator +( const AffineSpaceT& a ) { return AffineSpaceT(+a.l,+a.p); } + template __forceinline AffineSpaceT rcp( const AffineSpaceT& a ) { L il = rcp(a.l); return AffineSpaceT(il,-(il*a.p)); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline const AffineSpaceT operator +( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l+b.l,a.p+b.p); } + template __forceinline const AffineSpaceT operator -( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l-b.l,a.p-b.p); } + + template __forceinline const AffineSpaceT operator *( const ScalarT & a, const AffineSpaceT& b ) { return AffineSpaceT(a*b.l,a*b.p); } + template __forceinline const AffineSpaceT operator *( const AffineSpaceT& a, const AffineSpaceT& b ) { return AffineSpaceT(a.l*b.l,a.l*b.p+a.p); } + template __forceinline const AffineSpaceT operator /( const AffineSpaceT& a, const AffineSpaceT& b ) { return a * rcp(b); } + template __forceinline const AffineSpaceT operator /( const 
AffineSpaceT& a, const ScalarT & b ) { return a * rcp(b); } + + template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a * b; } + template __forceinline AffineSpaceT& operator *=( AffineSpaceT& a, const ScalarT & b ) { return a = a * b; } + template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const AffineSpaceT& b ) { return a = a / b; } + template __forceinline AffineSpaceT& operator /=( AffineSpaceT& a, const ScalarT & b ) { return a = a / b; } + + template __forceinline VectorT xfmPoint (const AffineSpaceT& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); } + template __forceinline VectorT xfmVector(const AffineSpaceT& m, const VectorT& v) { return xfmVector(m.l,v); } + template __forceinline VectorT xfmNormal(const AffineSpaceT& m, const VectorT& n) { return xfmNormal(m.l,n); } + + __forceinline const BBox xfmBounds(const AffineSpaceT >& m, const BBox& b) + { + BBox3fa dst = empty; + const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0)); + const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1)); + const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2)); + const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3)); + const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4)); + const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5)); + const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6)); + const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7)); + return dst; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l == b.l && a.p == b.p; } + 
template __forceinline bool operator !=( const AffineSpaceT& a, const AffineSpaceT& b ) { return a.l != b.l || a.p != b.p; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline AffineSpaceT select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT& t, const AffineSpaceT& f ) { + return AffineSpaceT(select(s,t.l,f.l),select(s,t.p,f.p)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT& m) { + return cout << "{ l = " << m.l << ", p = " << m.p << " }"; + } + + //////////////////////////////////////////////////////////////////////////////// + // Template Instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef AffineSpaceT AffineSpace2f; + typedef AffineSpaceT AffineSpace3f; + typedef AffineSpaceT AffineSpace3fa; + typedef AffineSpaceT AffineSpace3fx; + typedef AffineSpaceT AffineSpace3ff; + typedef AffineSpaceT OrthonormalSpace3f; + + template using AffineSpace3vf = AffineSpaceT>>>; + typedef AffineSpaceT>>> AffineSpace3vf4; + typedef AffineSpaceT>>> AffineSpace3vf8; + typedef AffineSpaceT>>> AffineSpace3vf16; + + template using AffineSpace3vff = AffineSpaceT>>>; + typedef AffineSpaceT>>> AffineSpace3vfa4; + typedef AffineSpaceT>>> AffineSpace3vfa8; + typedef AffineSpaceT>>> AffineSpace3vfa16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template + __forceinline AffineSpaceT lerp(const AffineSpaceT& M0, + const AffineSpaceT& M1, + const R& t) + { + return 
AffineSpaceT(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t)); + } + + // slerp interprets the 16 floats of the matrix M = D * R * S as components of + // three matrizes (D, R, S) that are interpolated individually. + template __forceinline AffineSpaceT>> + slerp(const AffineSpaceT>>& M0, + const AffineSpaceT>>& M1, + const T& t) + { + QuaternionT q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + QuaternionT q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + QuaternionT q = slerp(q0, q1, t); + + AffineSpaceT>> S = lerp(M0, M1, t); + AffineSpaceT>> D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpaceT>> R = LinearSpace3>(q); + return D * R * S; + } + + // this is a specialized version for Vec3fa because that does + // not play along nicely with the other templated Vec3/Vec4 types + __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0, + const AffineSpace3ff& M1, + const float& t) + { + Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); + Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w); + Quaternion3f q = slerp(q0, q1, t); + + AffineSpace3fa S = lerp(M0, M1, t); + AffineSpace3fa D(one); + D.p.x = S.l.vx.y; + D.p.y = S.l.vx.z; + D.p.z = S.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * S; + } + + __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd) + { + // compute affine transform from quaternion decomposition + Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + AffineSpace3fa M = qd; + AffineSpace3fa D(one); + D.p.x = M.l.vx.y; + D.p.y = M.l.vx.z; + D.p.z = M.l.vy.z; + M.l.vx.y = 0; + M.l.vx.z = 0; + M.l.vy.z = 0; + AffineSpace3fa R = LinearSpace3fa(q); + return D * R * M; + } + + __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S) + { + q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); + S = qd; + 
T.x = qd.l.vx.y; + T.y = qd.l.vx.z; + T.z = qd.l.vy.z; + S.l.vx.y = 0; + S.l.vx.z = 0; + S.l.vy.z = 0; + } + + __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S) + { + AffineSpace3ff M = S; + M.l.vx.w = q.i; + M.l.vy.w = q.j; + M.l.vz.w = q.k; + M.p.w = q.r; + M.l.vx.y = T.x; + M.l.vx.z = T.y; + M.l.vy.z = T.z; + return M; + } + + struct __aligned(16) QuaternionDecomposition + { + float scale_x = 1.f; + float scale_y = 1.f; + float scale_z = 1.f; + float skew_xy = 0.f; + float skew_xz = 0.f; + float skew_yz = 0.f; + float shift_x = 0.f; + float shift_y = 0.f; + float shift_z = 0.f; + float quaternion_r = 1.f; + float quaternion_i = 0.f; + float quaternion_j = 0.f; + float quaternion_k = 0.f; + float translation_x = 0.f; + float translation_y = 0.f; + float translation_z = 0.f; + }; + + __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M) + { + QuaternionDecomposition qd; + qd.scale_x = M.l.vx.x; + qd.scale_y = M.l.vy.y; + qd.scale_z = M.l.vz.z; + qd.shift_x = M.p.x; + qd.shift_y = M.p.y; + qd.shift_z = M.p.z; + qd.translation_x = M.l.vx.y; + qd.translation_y = M.l.vx.z; + qd.translation_z = M.l.vy.z; + qd.skew_xy = M.l.vy.x; + qd.skew_xz = M.l.vz.x; + qd.skew_yz = M.l.vz.y; + qd.quaternion_r = M.p.w; + qd.quaternion_i = M.l.vx.w; + qd.quaternion_j = M.l.vy.w; + qd.quaternion_k = M.l.vz.w; + return qd; + } + + //////////////////////////////////////////////////////////////////////////////// + /* + * ! 
Template Specialization for 2D: return matrix for rotation around point + * (rotation around arbitrarty vector is not meaningful in 2D) + */ + template<> __forceinline + AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) { + return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p); + } + + //////////////////////////////////////////////////////////////////////////////// + // Similarity Transform + // + // checks, if M is a similarity transformation, i.e if there exists a factor D + // such that for all x,y: distance(Mx, My) = D * distance(x, y) + //////////////////////////////////////////////////////////////////////////////// + __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D) + { + if (D) *D = 0.f; + if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false; + if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false; + if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false; + + const float D_x = dot(M.l.vx, M.l.vx); + const float D_y = dot(M.l.vy, M.l.vy); + const float D_z = dot(M.l.vz, M.l.vz); + + if (abs(D_x - D_y) > 1e-5f || + abs(D_x - D_z) > 1e-5f || + abs(D_y - D_z) > 1e-5f) + return false; + + if (D) *D = sqrtf(D_x); + return true; + } + + __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr) + { + Vec3fa::storeu(&ptr->l.vx, source.l.vx); + Vec3fa::storeu(&ptr->l.vy, source.l.vy); + Vec3fa::storeu(&ptr->l.vz, source.l.vz); + Vec3fa::storeu(&ptr->p, source.p); + } + + __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr) + { + AffineSpace3fa space; + space.l.vx = Vec3fa::loadu(&ptr->l.vx); + space.l.vy = Vec3fa::loadu(&ptr->l.vy); + space.l.vz = Vec3fa::loadu(&ptr->l.vz); + space.p = Vec3fa::loadu(&ptr->p); + return space; + } + + #undef VectorT + #undef ScalarT +} diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h new file mode 100644 index 00000000000..29bb13912b1 --- /dev/null +++ 
b/thirdparty/embree-aarch64/common/math/bbox.h @@ -0,0 +1,331 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" + +namespace embree +{ + namespace internal { + + template __forceinline T divideByTwo(const T& v) { return v / T(2); } + template <> __forceinline float divideByTwo(const float& v) { return v * 0.5f; } + template <> __forceinline double divideByTwo(const double& v) { return v * 0.5; } + + } // namespace internal + template + struct BBox + { + T lower, upper; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox ( ) { } + template + __forceinline BBox ( const BBox& other ) : lower(other.lower), upper(other.upper) {} + __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline BBox ( const T& v ) : lower(v), upper(v) {} + __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Extending Bounds + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const BBox& extend(const T & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + /*! tests if box is empty */ + __forceinline bool empty() const { for (int i=0; i upper[i]) return true; return false; } + + /*! computes the size of the box */ + __forceinline T size() const { return upper - lower; } + + /*! computes the center of the box */ + __forceinline T center() const { return internal::divideByTwo(lower+upper); } + + /*! 
computes twice the center of the box */ + __forceinline T center2() const { return lower+upper; } + + /*! merges two boxes */ + __forceinline static const BBox merge (const BBox& a, const BBox& b) { + return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); + } + + /*! enlarge box by some scaling factor */ + __forceinline BBox enlarge_by(const float a) const { + return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( FullTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( TrueTy ) : lower(neg_inf), upper(pos_inf) {} + __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} + __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} + }; + + template<> __forceinline bool BBox::empty() const { + return lower > upper; + } + +#if defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline bool BBox::empty() const { + return !all(le_mask(lower,upper)); + } + template<> __forceinline bool BBox::empty() const { + return !all(le_mask(lower,upper)); + } +#endif + + /*! tests if box is finite */ + __forceinline bool isvalid( const BBox& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE))); + } + + /*! tests if box is finite and non-empty*/ + __forceinline bool isvalid_non_empty( const BBox& v ) { + return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper)); + } + + /*! tests if box has finite entries */ + __forceinline bool is_finite( const BBox& b) { + return is_finite(b.lower) && is_finite(b.upper); + } + + /*! 
test if point contained in box */ + __forceinline bool inside ( const BBox& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); } + + /*! computes the center of the box */ + template __forceinline const T center2(const BBox& box) { return box.lower + box.upper; } + template __forceinline const T center (const BBox& box) { return internal::divideByTwo(center2(box)); } + + /*! computes the volume of a bounding box */ + __forceinline float volume ( const BBox& b ) { return reduce_mul(b.size()); } + __forceinline float safeVolume( const BBox& b ) { if (b.empty()) return 0.0f; else return volume(b); } + + /*! computes the volume of a bounding box */ + __forceinline float volume( const BBox& b ) { return reduce_mul(b.size()); } + + /*! computes the surface area of a bounding box */ + template __forceinline const T area( const BBox >& b ) { const Vec2 d = b.size(); return d.x*d.y; } + + template __forceinline const T halfArea( const BBox >& b ) { return halfArea(b.size()); } + template __forceinline const T area( const BBox >& b ) { return T(2)*halfArea(b); } + + __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } + + __forceinline float halfArea( const BBox& b ) { return halfArea(b.size()); } + __forceinline float area( const BBox& b ) { return 2.0f*halfArea(b); } + + template __forceinline float safeArea( const BBox& b ) { if (b.empty()) return 0.0f; else return area(b); } + + template __forceinline float expectedApproxHalfArea(const BBox& box) { + return halfArea(box); + } + + /*! 
merges bounding boxes and points */ + template __forceinline const BBox merge( const BBox& a, const T& b ) { return BBox(min(a.lower, b ), max(a.upper, b )); } + template __forceinline const BBox merge( const T& a, const BBox& b ) { return BBox(min(a , b.lower), max(a , b.upper)); } + template __forceinline const BBox merge( const BBox& a, const BBox& b ) { return BBox(min(a.lower, b.lower), max(a.upper, b.upper)); } + + /*! Merges three boxes. */ + template __forceinline const BBox merge( const BBox& a, const BBox& b, const BBox& c ) { return merge(a,merge(b,c)); } + + /*! Merges four boxes. */ + template __forceinline BBox merge(const BBox& a, const BBox& b, const BBox& c, const BBox& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! Comparison Operators */ + template __forceinline bool operator==( const BBox& a, const BBox& b ) { return a.lower == b.lower && a.upper == b.upper; } + template __forceinline bool operator!=( const BBox& a, const BBox& b ) { return a.lower != b.lower || a.upper != b.upper; } + + /*! scaling */ + template __forceinline BBox operator *( const float& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } + template __forceinline BBox operator *( const T& a, const BBox& b ) { return BBox(a*b.lower,a*b.upper); } + + /*! translations */ + template __forceinline BBox operator +( const BBox& a, const BBox& b ) { return BBox(a.lower+b.lower,a.upper+b.upper); } + template __forceinline BBox operator -( const BBox& a, const BBox& b ) { return BBox(a.lower-b.lower,a.upper-b.upper); } + template __forceinline BBox operator +( const BBox& a, const T & b ) { return BBox(a.lower+b ,a.upper+b ); } + template __forceinline BBox operator -( const BBox& a, const T & b ) { return BBox(a.lower-b ,a.upper-b ); } + + /*! extension */ + template __forceinline BBox enlarge(const BBox& a, const T& b) { return BBox(a.lower-b, a.upper+b); } + + /*! 
intersect bounding boxes */ + template __forceinline const BBox intersect( const BBox& a, const BBox& b ) { return BBox(max(a.lower, b.lower), min(a.upper, b.upper)); } + template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c ) { return intersect(a,intersect(b,c)); } + template __forceinline const BBox intersect( const BBox& a, const BBox& b, const BBox& c, const BBox& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + /*! subtract bounds from each other */ + template __forceinline void subtract(const BBox& a, const BBox& b, BBox& c, BBox& d) + { + c.lower = a.lower; + c.upper = min(a.upper,b.lower); + d.lower = max(a.lower,b.upper); + d.upper = a.upper; + } + + /*! tests if bounding boxes (and points) are disjoint (empty intersection) */ + template __inline bool disjoint( const BBox& a, const BBox& b ) { return intersect(a,b).empty(); } + template __inline bool disjoint( const BBox& a, const T& b ) { return disjoint(a,BBox(b)); } + template __inline bool disjoint( const T& a, const BBox& b ) { return disjoint(BBox(a),b); } + + /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */ + template __inline bool conjoint( const BBox& a, const BBox& b ) { return !intersect(a,b).empty(); } + template __inline bool conjoint( const BBox& a, const T& b ) { return conjoint(a,BBox(b)); } + template __inline bool conjoint( const T& a, const BBox& b ) { return conjoint(BBox(a),b); } + + /*! 
subset relation */ + template __inline bool subset( const BBox& a, const BBox& b ) + { + for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false; + for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false; + return true; + } + + template<> __inline bool subset( const BBox& a, const BBox& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + template<> __inline bool subset( const BBox& a, const BBox& b ) { + return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + } + + /*! blending */ + template + __forceinline BBox lerp(const BBox& b0, const BBox& b1, const float t) { + return BBox(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t)); + } + + /*! output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const BBox& box) { + return cout << "[" << box.lower << "; " << box.upper << "]"; + } + + /*! default template instantiations */ + typedef BBox BBox1f; + typedef BBox BBox2f; + typedef BBox BBox2fa; + typedef BBox BBox3f; + typedef BBox BBox3fa; + typedef BBox BBox3fx; + typedef BBox BBox3ff; +} + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined (__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template + __forceinline BBox>> transpose(const BBox3fa* bounds); + + template<> + __forceinline BBox> transpose<4>(const BBox3fa* bounds) + { + BBox> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + 
(vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } + +#if defined(__AVX__) + template<> + __forceinline BBox> transpose<8>(const BBox3fa* bounds) + { + BBox> dest; + + transpose((vfloat4&)bounds[0].lower, + (vfloat4&)bounds[1].lower, + (vfloat4&)bounds[2].lower, + (vfloat4&)bounds[3].lower, + (vfloat4&)bounds[4].lower, + (vfloat4&)bounds[5].lower, + (vfloat4&)bounds[6].lower, + (vfloat4&)bounds[7].lower, + dest.lower.x, + dest.lower.y, + dest.lower.z); + + transpose((vfloat4&)bounds[0].upper, + (vfloat4&)bounds[1].upper, + (vfloat4&)bounds[2].upper, + (vfloat4&)bounds[3].upper, + (vfloat4&)bounds[4].upper, + (vfloat4&)bounds[5].upper, + (vfloat4&)bounds[6].upper, + (vfloat4&)bounds[7].upper, + dest.upper.x, + dest.upper.y, + dest.upper.z); + + return dest; + } +#endif + + template + __forceinline BBox3fa merge(const BBox3fa* bounds); + + template<> + __forceinline BBox3fa merge<4>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower), + min(bounds[2].lower,bounds[3].lower)); + const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper), + max(bounds[2].upper,bounds[3].upper)); + return BBox3fa(lower,upper); + } + +#if defined(__AVX__) + template<> + __forceinline BBox3fa merge<8>(const BBox3fa* bounds) + { + const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)), + min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower))); + const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)), + max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper))); + return BBox3fa(lower,upper); + } +#endif +} + diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h new file mode 100644 index 00000000000..f52015fb88e --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/col3.h @@ -0,0 
+1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Col3 + { + T r, g, b; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 ( ) { } + __forceinline Col3 ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } + __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; } + + __forceinline explicit Col3 (const T& v) : r(v), g(v), b(v) {} + __forceinline Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col3 (ZeroTy) : r(zero) , g(zero) , b(zero) {} + __forceinline Col3 (OneTy) : r(one) , g(one) , b(one) {} + __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {} + __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {} + }; + + /*! output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const Col3& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } + + /*! 
default template instantiations */ + typedef Col3 Col3uc; + typedef Col3 Col3f; +} diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h new file mode 100644 index 00000000000..90df293f8e4 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/col4.h @@ -0,0 +1,47 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Col4 + { + T r, g, b, a; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 ( ) { } + __forceinline Col4 ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } + __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; } + + __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} + __forceinline Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Col4 (ZeroTy) : r(zero) , g(zero) , b(zero) , a(zero) {} + __forceinline Col4 (OneTy) : r(one) , g(one) , b(one) , a(one) {} + __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {} + __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {} + }; + + /*! 
output operator */ + template __forceinline embree_ostream operator<<(embree_ostream cout, const Col4& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")"; + } + + /*! default template instantiations */ + typedef Col4 Col4uc; + typedef Col4 Col4f; +} diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h new file mode 100644 index 00000000000..c3083e4fc0f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/color.h @@ -0,0 +1,257 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "constants.h" +#include "col3.h" +#include "col4.h" + +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGBA Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color4 + { + union { + __m128 m128; + struct { float r,g,b,a; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4 () {} + __forceinline Color4 ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} + + __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col3f& other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } + __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } + __forceinline explicit Color4 ( const Col4f& other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); } + 
+ __forceinline Color4 ( const Color4& other ) : m128(other.m128) {} + __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + d.a = (uint8_t)(s[3]); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color4( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color4( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// SSE RGB Color Class + //////////////////////////////////////////////////////////////////////////////// + + struct Color + { + union { + __m128 m128; + struct { float r,g,b; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color () {} + __forceinline Color ( const __m128 a ) : m128(a) {} + + __forceinline explicit Color 
(const float v) : m128(_mm_set1_ps(v)) {} + __forceinline Color (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} + + __forceinline Color ( const Color& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; } + + __forceinline Color ( const Color4& other ) : m128(other.m128) {} + __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; } + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Set + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } + __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } + __forceinline void set(Col3uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + } + __forceinline void set(Col4uc& d) const + { + vfloat4 s = clamp(vfloat4(m128))*255.0f; + d.r = (uint8_t)(s[0]); + d.g = (uint8_t)(s[1]); + d.b = (uint8_t)(s[2]); + d.a = 255; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Color( ZeroTy ) : m128(_mm_set1_ps(0.0f)) {} + __forceinline Color( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a ) { return a; } + __forceinline const 
Color operator -( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline const Color abs ( const Color& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline const Color rcp ( const Color& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Color)reciprocal; +#else +#if defined(__AVX512VL__) + const Color r = _mm_rcp14_ps(a.m128); +#else + const Color r = _mm_rcp_ps(a.m128); +#endif + return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif //defined(__aarch64__) && defined(BUILD_IOS) + } + __forceinline const Color rsqrt( const Color& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif //defined(__aarch64__) && defined(BUILD_IOS) + } + __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline const Color 
operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline const Color operator *( const Color& a, const float b ) { return a * Color(b); } + __forceinline const Color operator *( const float a, const Color& b ) { return Color(a) * b; } + __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } + __forceinline const Color operator /( const Color& a, const float b ) { return a * rcp(b); } + + __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } + __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; } + __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; } + __forceinline const Color operator*=(Color& a, const float b ) { return a = a * b; } + __forceinline const Color operator/=(Color& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } + __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; } + __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); } + __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators 
+ //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + __forceinline bool operator < ( const Color& a, const Color& b ) { + if (a.r != b.r) return a.r < b.r; + if (a.g != b.g) return a.g < b.g; + if (a.b != b.b) return a.b < b.b; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const Color select( bool s, const Color& t, const Color& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Special Operators + //////////////////////////////////////////////////////////////////////////////// + + /*! computes luminance of a color */ + __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); } + + /*! 
output operator */ + __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) { + return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp new file mode 100644 index 00000000000..eeff131664b --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/constants.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#if defined(__aarch64__) +#include +#endif + +#include "constants.h" + +namespace embree +{ + TrueTy True; + FalseTy False; + ZeroTy zero; + OneTy one; + NegInfTy neg_inf; + PosInfTy inf; + PosInfTy pos_inf; + NaNTy nan; + UlpTy ulp; + PiTy pi; + OneOverPiTy one_over_pi; + TwoPiTy two_pi; + OneOverTwoPiTy one_over_two_pi; + FourPiTy four_pi; + OneOverFourPiTy one_over_four_pi; + StepTy step; + ReverseStepTy reverse_step; + EmptyTy empty; + UndefinedTy undefined; + +#if defined(__aarch64__) +const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; +const uint32x4_t vzero = { 0, 0, 0, 0 }; +const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; +const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }; +const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vF0F0 
= { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }; +const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }; +const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }; +const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; +const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; +const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; +const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; +const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f }; +const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f }; +const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY }; +const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY }; +#endif + +} diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h new file mode 100644 index 00000000000..e80abec80f5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/constants.h @@ -0,0 +1,239 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +#include + +#define _USE_MATH_DEFINES +#include // using cmath causes issues under Windows +#include +#include + +// Math constants may not be defined in libcxx + mingw + strict C++ standard +#if defined(__MINGW32__) + +// TODO(LTE): use constexpr +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#ifndef M_1_PI +#define M_1_PI 0.31830988618379067154 +#endif + +#endif // __MINGW32__ + +namespace embree +{ + static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f; + static MAYBE_UNUSED const float min_rcp_input = 1E-18f; // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail + + /* we consider floating point numbers in that range as valid input 
numbers */ + static MAYBE_UNUSED float FLT_LARGE = 1.844E18f; + + struct TrueTy { + __forceinline operator bool( ) const { return true; } + }; + + extern MAYBE_UNUSED TrueTy True; + + struct FalseTy { + __forceinline operator bool( ) const { return false; } + }; + + extern MAYBE_UNUSED FalseTy False; + + struct ZeroTy + { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator int8_t ( ) const { return 0; } + __forceinline operator uint8_t ( ) const { return 0; } + }; + + extern MAYBE_UNUSED ZeroTy zero; + + struct OneTy + { + __forceinline operator double ( ) const { return 1; } + __forceinline operator float ( ) const { return 1; } + __forceinline operator long long( ) const { return 1; } + __forceinline operator unsigned long long( ) const { return 1; } + __forceinline operator long ( ) const { return 1; } + __forceinline operator unsigned long ( ) const { return 1; } + __forceinline operator int ( ) const { return 1; } + __forceinline operator unsigned int ( ) const { return 1; } + __forceinline operator short ( ) const { return 1; } + __forceinline operator unsigned short ( ) const { return 1; } + __forceinline operator int8_t ( ) const { return 1; } + __forceinline operator uint8_t ( ) const { return 1; } + }; + + extern MAYBE_UNUSED OneTy one; + + struct NegInfTy + { + __forceinline operator double ( ) const { return -std::numeric_limits::infinity(); } + __forceinline operator float ( ) const { return 
-std::numeric_limits::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits::min(); } + __forceinline operator long ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits::min(); } + __forceinline operator int ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits::min(); } + __forceinline operator short ( ) const { return std::numeric_limits::min(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits::min(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::min(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::min(); } + + }; + + extern MAYBE_UNUSED NegInfTy neg_inf; + + struct PosInfTy + { + __forceinline operator double ( ) const { return std::numeric_limits::infinity(); } + __forceinline operator float ( ) const { return std::numeric_limits::infinity(); } + __forceinline operator long long( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned long long( ) const { return std::numeric_limits::max(); } + __forceinline operator long ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned long ( ) const { return std::numeric_limits::max(); } + __forceinline operator int ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned int ( ) const { return std::numeric_limits::max(); } + __forceinline operator short ( ) const { return std::numeric_limits::max(); } + __forceinline operator unsigned short ( ) const { return std::numeric_limits::max(); } + __forceinline operator int8_t ( ) const { return std::numeric_limits::max(); } + __forceinline operator uint8_t ( ) const { return std::numeric_limits::max(); } + }; + + extern MAYBE_UNUSED PosInfTy 
inf; + extern MAYBE_UNUSED PosInfTy pos_inf; + + struct NaNTy + { + __forceinline operator double( ) const { return std::numeric_limits::quiet_NaN(); } + __forceinline operator float ( ) const { return std::numeric_limits::quiet_NaN(); } + }; + + extern MAYBE_UNUSED NaNTy nan; + + struct UlpTy + { + __forceinline operator double( ) const { return std::numeric_limits::epsilon(); } + __forceinline operator float ( ) const { return std::numeric_limits::epsilon(); } + }; + + extern MAYBE_UNUSED UlpTy ulp; + + struct PiTy + { + __forceinline operator double( ) const { return double(M_PI); } + __forceinline operator float ( ) const { return float(M_PI); } + }; + + extern MAYBE_UNUSED PiTy pi; + + struct OneOverPiTy + { + __forceinline operator double( ) const { return double(M_1_PI); } + __forceinline operator float ( ) const { return float(M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverPiTy one_over_pi; + + struct TwoPiTy + { + __forceinline operator double( ) const { return double(2.0*M_PI); } + __forceinline operator float ( ) const { return float(2.0*M_PI); } + }; + + extern MAYBE_UNUSED TwoPiTy two_pi; + + struct OneOverTwoPiTy + { + __forceinline operator double( ) const { return double(0.5*M_1_PI); } + __forceinline operator float ( ) const { return float(0.5*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + + struct FourPiTy + { + __forceinline operator double( ) const { return double(4.0*M_PI); } + __forceinline operator float ( ) const { return float(4.0*M_PI); } + }; + + extern MAYBE_UNUSED FourPiTy four_pi; + + struct OneOverFourPiTy + { + __forceinline operator double( ) const { return double(0.25*M_1_PI); } + __forceinline operator float ( ) const { return float(0.25*M_1_PI); } + }; + + extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + + struct StepTy { + }; + + extern MAYBE_UNUSED StepTy step; + + struct ReverseStepTy { + }; + + extern MAYBE_UNUSED ReverseStepTy reverse_step; + + struct EmptyTy { + }; + + extern MAYBE_UNUSED 
EmptyTy empty; + + struct FullTy { + }; + + extern MAYBE_UNUSED FullTy full; + + struct UndefinedTy { + }; + + extern MAYBE_UNUSED UndefinedTy undefined; + +#if defined(__aarch64__) + extern const uint32x4_t movemask_mask; + extern const uint32x4_t vzero; + extern const uint32x4_t v0x80000000; + extern const uint32x4_t v0x7fffffff; + extern const uint32x4_t v000F; + extern const uint32x4_t v00F0; + extern const uint32x4_t v00FF; + extern const uint32x4_t v0F00; + extern const uint32x4_t v0F0F; + extern const uint32x4_t v0FF0; + extern const uint32x4_t v0FFF; + extern const uint32x4_t vF000; + extern const uint32x4_t vF00F; + extern const uint32x4_t vF0F0; + extern const uint32x4_t vF0FF; + extern const uint32x4_t vFF00; + extern const uint32x4_t vFF0F; + extern const uint32x4_t vFFF0; + extern const uint32x4_t vFFFF; + extern const uint8x16_t v0022; + extern const uint8x16_t v1133; + extern const uint8x16_t v0101; + extern const float32x4_t vOne; + extern const float32x4_t vmOne; + extern const float32x4_t vInf; + extern const float32x4_t vmInf; +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h new file mode 100644 index 00000000000..f06478e881f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/interval.h @@ -0,0 +1,161 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" +#include "vec3.h" +#include "bbox.h" + +namespace embree +{ + template + struct Interval + { + V lower, upper; + + __forceinline Interval() {} + __forceinline Interval ( const Interval& other ) { lower = other.lower; upper = other.upper; } + __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; } + + __forceinline Interval(const V& a) : lower(a), upper(a) {} + __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {} + __forceinline Interval(const BBox& a) : 
lower(a.lower), upper(a.upper) {} + + /*! tests if box is empty */ + //__forceinline bool empty() const { return lower > upper; } + + /*! computes the size of the interval */ + __forceinline V size() const { return upper - lower; } + + __forceinline V center() const { return 0.5f*(lower+upper); } + + __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } + __forceinline const Interval& extend(const V & other) { lower = min(lower,other ); upper = max(upper,other ); return *this; } + + __forceinline friend Interval operator +( const Interval& a, const Interval& b ) { + return Interval(a.lower+b.lower,a.upper+b.upper); + } + + __forceinline friend Interval operator -( const Interval& a, const Interval& b ) { + return Interval(a.lower-b.upper,a.upper-b.lower); + } + + __forceinline friend Interval operator -( const Interval& a, const V& b ) { + return Interval(a.lower-b,a.upper-b); + } + + __forceinline friend Interval operator *( const Interval& a, const Interval& b ) + { + const V ll = a.lower*b.lower; + const V lu = a.lower*b.upper; + const V ul = a.upper*b.lower; + const V uu = a.upper*b.upper; + return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b) { + return Interval(min(a.lower,b.lower),max(a.upper,b.upper)); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) { + return merge(merge(a,b),c); + } + + __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) { + return merge(merge(a,b),merge(c,d)); + } + + /*! 
intersect bounding boxes */ + __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); } + __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); } + + friend embree_ostream operator<<(embree_ostream cout, const Interval& a) { + return cout << "[" << a.lower << ", " << a.upper << "]"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} + __forceinline Interval( FullTy ) : lower(neg_inf), upper(pos_inf) {} + }; + + __forceinline bool isEmpty(const Interval& v) { + return v.lower > v.upper; + } + + __forceinline vboolx isEmpty(const Interval& v) { + return v.lower > v.upper; + } + + /*! 
subset relation */ + template __forceinline bool subset( const Interval& a, const Interval& b ) { + return (a.lower > b.lower) && (a.upper < b.upper); + } + + template __forceinline bool subset( const Vec2>& a, const Vec2>& b ) { + return subset(a.x,b.x) && subset(a.y,b.y); + } + + template __forceinline const Vec2> intersect( const Vec2>& a, const Vec2>& b ) { + return Vec2>(intersect(a.x,b.x),intersect(a.y,b.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Interval select ( bool s, const Interval& t, const Interval& f ) { + return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + template __forceinline Interval select ( const typename T::Bool& s, const Interval& t, const Interval& f ) { + return Interval(select(s,t.lower,f.lower),select(s,t.upper,f.upper)); + } + + __forceinline int numRoots(const Interval& p0, const Interval& p1) + { + float eps = 1E-4f; + bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps; + bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps; + return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1); + } + + typedef Interval Interval1f; + typedef Vec2> Interval2f; + typedef Vec3> Interval3f; + +inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; } + +inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); } + +#define TWO_PI (2.0*M_PI) +inline Interval1f sin(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float sinLower = sin(interval.lower); + float sinUpper = sin(interval.upper); + if (sinLower > 
sinUpper) swap(sinLower, sinUpper); + if (interval.lower < M_PI / 2.0 && interval.upper > M_PI / 2.0) sinUpper = 1.0; + if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0; + return Interval1f(sinLower, sinUpper); +} + +inline Interval1f cos(Interval1f interval) +{ + if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); } + if (interval.upper > TWO_PI) { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); } + if (interval.lower < 0) { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); } + float cosLower = cos(interval.lower); + float cosUpper = cos(interval.upper); + if (cosLower > cosUpper) swap(cosLower, cosUpper); + if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0; + return Interval1f(cosLower, cosUpper); +} +#undef TWO_PI +} diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h new file mode 100644 index 00000000000..95df4a918d7 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/lbbox.h @@ -0,0 +1,289 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "range.h" + +namespace embree +{ + template + __forceinline std::pair globalLinear(const std::pair& v, const BBox1f& dt) + { + const float rcp_dt_size = float(1.0f)/dt.size(); + const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size); + const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size); + return std::make_pair(g0,g1); + } + + template + struct LBBox + { + public: + __forceinline LBBox () {} + + template + __forceinline LBBox ( const LBBox& other ) + : bounds0(other.bounds0), bounds1(other.bounds1) {} + + __forceinline LBBox& operator= ( const LBBox& other ) { + bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; + } + + __forceinline LBBox (EmptyTy) + : bounds0(EmptyTy()), bounds1(EmptyTy()) {} + + __forceinline explicit LBBox ( const BBox& 
bounds) + : bounds0(bounds), bounds1(bounds) { } + + __forceinline LBBox ( const BBox& bounds0, const BBox& bounds1) + : bounds0(bounds0), bounds1(bounds1) { } + + LBBox ( const avector>& bounds ) + { + assert(bounds.size()); + BBox b0 = bounds.front(); + BBox b1 = bounds.back(); + for (size_t i=1; i bt = lerp(b0,b1,f); + const T dlower = min(bounds[i].lower-bt.lower,T(zero)); + const T dupper = max(bounds[i].upper-bt.upper,T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + bounds0 = b0; + bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments) + { + const float lower = time_range.lower*numTimeSegments; + const float upper = time_range.upper*numTimeSegments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const int ilower = (int)ilowerf; + const int iupper = (int)iupperf; + + const BBox blower0 = bounds(ilower); + const BBox bupper1 = bounds(iupper); + + if (iupper-ilower == 1) { + bounds0 = lerp(blower0, bupper1, lower-ilowerf); + bounds1 = lerp(bupper1, blower0, iupperf-upper); + return; + } + + const BBox blower1 = bounds(ilower+1); + const BBox bupper0 = bounds(iupper-1); + BBox b0 = lerp(blower0, blower1, lower-ilowerf); + BBox b1 = lerp(bupper1, bupper0, iupperf-upper); + + for (int i = ilower+1; i < iupper; i++) + { + const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size(); + const BBox bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + /*! 
calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments) + { + /* normalize global time_range_in to local geom_time_range */ + const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(), + (time_range_in.upper-geom_time_range.lower)/geom_time_range.size()); + + const float lower = time_range.lower*geom_time_segments; + const float upper = time_range.upper*geom_time_segments; + const float ilowerf = floor(lower); + const float iupperf = ceil(upper); + const float ilowerfc = max(0.0f,ilowerf); + const float iupperfc = min(iupperf,geom_time_segments); + const int ilowerc = (int)ilowerfc; + const int iupperc = (int)iupperfc; + assert(iupperc-ilowerc > 0); + + /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */ + const int ilower_iter = max(-1,(int)ilowerf); + const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1); + + const BBox blower0 = bounds(ilowerc); + const BBox bupper1 = bounds(iupperc); + if (iupper_iter-ilower_iter == 1) { + bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc)); + bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper)); + return; + } + + const BBox blower1 = bounds(ilowerc+1); + const BBox bupper0 = bounds(iupperc-1); + BBox b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc)); + BBox b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper)); + + for (int i = ilower_iter+1; i < iupper_iter; i++) + { + const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size(); + const BBox bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + 
bounds1 = b1; + } + + /*! calculates the linear bounds of a primitive for the specified time range */ + template + __forceinline LBBox(const BoundsFunc& bounds, const range& time_range, int numTimeSegments) + { + const int ilower = time_range.begin(); + const int iupper = time_range.end(); + + BBox b0 = bounds(ilower); + BBox b1 = bounds(iupper); + + if (iupper-ilower == 1) + { + bounds0 = b0; + bounds1 = b1; + return; + } + + for (int i = ilower+1; i bt = lerp(b0, b1, f); + const BBox bi = bounds(i); + const T dlower = min(bi.lower-bt.lower, T(zero)); + const T dupper = max(bi.upper-bt.upper, T(zero)); + b0.lower += dlower; b1.lower += dlower; + b0.upper += dupper; b1.upper += dupper; + } + + bounds0 = b0; + bounds1 = b1; + } + + public: + + __forceinline bool empty() const { + return bounds().empty(); + } + + __forceinline BBox bounds () const { + return merge(bounds0,bounds1); + } + + __forceinline BBox interpolate( const float t ) const { + return lerp(bounds0,bounds1,t); + } + + __forceinline LBBox interpolate( const BBox1f& dt ) const { + return LBBox(interpolate(dt.lower),interpolate(dt.upper)); + } + + __forceinline void extend( const LBBox& other ) { + bounds0.extend(other.bounds0); + bounds1.extend(other.bounds1); + } + + __forceinline float expectedHalfArea() const; + + __forceinline float expectedHalfArea(const BBox1f& dt) const { + return interpolate(dt).expectedHalfArea(); + } + + __forceinline float expectedApproxHalfArea() const { + return 0.5f*(halfArea(bounds0) + halfArea(bounds1)); + } + + /* calculates bounds for [0,1] time range from bounds in dt time range */ + __forceinline LBBox global(const BBox1f& dt) const + { + const float rcp_dt_size = 1.0f/dt.size(); + const BBox b0 = interpolate(-dt.lower*rcp_dt_size); + const BBox b1 = interpolate((1.0f-dt.lower)*rcp_dt_size); + return LBBox(b0,b1); + } + + /*! 
Comparison Operators */ + //template friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + //template friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; } + friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; } + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) { + return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }"; + } + + public: + BBox bounds0, bounds1; + }; + + /*! tests if box is finite */ + template + __forceinline bool isvalid( const LBBox& v ) { + return isvalid(v.bounds0) && isvalid(v.bounds1); + } + + template + __forceinline bool isvalid_non_empty( const LBBox& v ) { + return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1); + } + + template + __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) + { + const T da = a1-a0; + const T db = b1-b0; + return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f); + } + + template<> __forceinline float LBBox::expectedHalfArea() const + { + const Vec3fa d0 = bounds0.size(); + const Vec3fa d1 = bounds1.size(); + return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z), + Vec3fa(d1.x,d1.y,d1.z), + Vec3fa(d0.y,d0.z,d0.x), + Vec3fa(d1.y,d1.z,d1.x))); + } + + template + __forceinline float expectedApproxHalfArea(const LBBox& box) { + return box.expectedApproxHalfArea(); + } + + template + __forceinline LBBox merge(const LBBox& a, const LBBox& b) { + return LBBox(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1)); + } + + /*! 
subset relation */ + template __inline bool subset( const LBBox& a, const LBBox& b ) { + return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1); + } + + /*! default template instantiations */ + typedef LBBox LBBox1f; + typedef LBBox LBBox2f; + typedef LBBox LBBox3f; + typedef LBBox LBBox3fa; + typedef LBBox LBBox3fx; +} diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h new file mode 100644 index 00000000000..b9a382962cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/linearspace2.h @@ -0,0 +1,148 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec2.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 2D Linear Transform (2x2 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template struct LinearSpace2 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! default matrix constructor */ + __forceinline LinearSpace2 ( ) {} + __forceinline LinearSpace2 ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; } + __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; } + + template __forceinline LinearSpace2( const LinearSpace2& s ) : vx(s.vx), vy(s.vy) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace2(const Vector& vx, const Vector& vy) + : vx(vx), vy(vy) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, + const Scalar& m10, const Scalar& m11) + : vx(m00,m10), vy(m01,m11) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; } + + /*! 
compute adjoint matrix */ + __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x); } + + /*! returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {} + __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace2 scale(const Vector& s) { + return LinearSpace2(s.x, 0, + 0 , s.y); + } + + /*! return matrix for rotation */ + static __forceinline LinearSpace2 rotate(const Scalar& r) { + Scalar s = sin(r), c = cos(r); + return LinearSpace2(c, -s, + s, c); + } + + /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */ + LinearSpace2 orthogonal() const + { + LinearSpace2 m = *this; + + // mirrored? + Scalar mirror(one); + if (m.det() < Scalar(zero)) { + m.vx = -m.vx; + mirror = -mirror; + } + + // rotation + for (int i = 0; i < 99; i++) { + const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); + const LinearSpace2 d = m_next - m; + m = m_next; + // norm^2 of difference small enough? + if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) + break; + } + + // rotation * mirror_x + return LinearSpace2(mirror*m.vx, m.vy); + } + + public: + + /*! 
the column vectors of the matrix */ + Vector vx,vy; + }; + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace2 operator -( const LinearSpace2& a ) { return LinearSpace2(-a.vx,-a.vy); } + template __forceinline LinearSpace2 operator +( const LinearSpace2& a ) { return LinearSpace2(+a.vx,+a.vy); } + template __forceinline LinearSpace2 rcp ( const LinearSpace2& a ) { return a.inverse(); } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace2 operator +( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx+b.vx,a.vy+b.vy); } + template __forceinline LinearSpace2 operator -( const LinearSpace2& a, const LinearSpace2& b ) { return LinearSpace2(a.vx-b.vx,a.vy-b.vy); } + + template __forceinline LinearSpace2 operator*(const typename T::Scalar & a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + template __forceinline T operator*(const LinearSpace2& a, const T & b) { return b.x*a.vx + b.y*a.vy; } + template __forceinline LinearSpace2 operator*(const LinearSpace2& a, const LinearSpace2& b) { return LinearSpace2(a*b.vx, a*b.vy); } + + template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const typename T::Scalar & b) { return LinearSpace2(a.vx/b, a.vy/b); } + template __forceinline LinearSpace2 operator/(const LinearSpace2& a, const LinearSpace2& b) { return a * rcp(b); } + + template __forceinline LinearSpace2& operator *=( LinearSpace2& a, const LinearSpace2& b ) { return a = a * b; } + template __forceinline LinearSpace2& operator /=( LinearSpace2& a, const LinearSpace2& b ) { return a = a / b; } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx == b.vx && a.vy == b.vy; } + template __forceinline bool operator !=( const LinearSpace2& a, const LinearSpace2& b ) { return a.vx != b.vx || a.vy != b.vy; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const LinearSpace2& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace2 LinearSpace2f; + typedef LinearSpace2 LinearSpace2fa; +} diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h new file mode 100644 index 00000000000..12b5bb776bf --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/linearspace3.h @@ -0,0 +1,213 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "quaternion.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// 3D Linear Transform (3x3 Matrix) + //////////////////////////////////////////////////////////////////////////////// + + template struct LinearSpace3 + { + typedef T Vector; + typedef typename T::Scalar Scalar; + + /*! 
default matrix constructor */ + __forceinline LinearSpace3 ( ) {} + __forceinline LinearSpace3 ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; } + __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } + + template __forceinline LinearSpace3( const LinearSpace3& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} + + /*! matrix construction from column vectors */ + __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz) + : vx(vx), vy(vy), vz(vz) {} + + /*! construction from quaternion */ + __forceinline LinearSpace3( const QuaternionT& q ) + : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j)) + , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i)) + , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {} + + /*! matrix construction from row mayor data */ + __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02, + const Scalar& m10, const Scalar& m11, const Scalar& m12, + const Scalar& m20, const Scalar& m21, const Scalar& m22) + : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {} + + /*! compute the determinant of the matrix */ + __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); } + + /*! compute adjoint matrix */ + __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); } + + /*! compute inverse matrix */ + __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); } + + /*! compute transposed matrix */ + __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); } + + /*! returns first row of matrix */ + __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); } + + /*! 
returns second row of matrix */ + __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); } + + /*! returns third row of matrix */ + __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} + __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} + + /*! return matrix for scaling */ + static __forceinline LinearSpace3 scale(const Vector& s) { + return LinearSpace3(s.x, 0, 0, + 0 , s.y, 0, + 0 , 0, s.z); + } + + /*! return matrix for rotation around arbitrary axis */ + static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) { + Vector u = normalize(_u); + Scalar s = sin(r), c = cos(r); + return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c, u.x*u.y*(1-c)-u.z*s, u.x*u.z*(1-c)+u.y*s, + u.x*u.y*(1-c)+u.z*s, u.y*u.y+(1-u.y*u.y)*c, u.y*u.z*(1-c)-u.x*s, + u.x*u.z*(1-c)-u.y*s, u.y*u.z*(1-c)+u.x*s, u.z*u.z+(1-u.z*u.z)*c); + } + + public: + + /*! the column vectors of the matrix */ + Vector vx,vy,vz; + }; + + /*! 
compute transposed matrix */ + template<> __forceinline const LinearSpace3 LinearSpace3::transposed() const { + vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); + return LinearSpace3(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); + } + + template + __forceinline const LinearSpace3 transposed(const LinearSpace3& xfm) { + return xfm.transposed(); + } + + //////////////////////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 operator -( const LinearSpace3& a ) { return LinearSpace3(-a.vx,-a.vy,-a.vz); } + template __forceinline LinearSpace3 operator +( const LinearSpace3& a ) { return LinearSpace3(+a.vx,+a.vy,+a.vz); } + template __forceinline LinearSpace3 rcp ( const LinearSpace3& a ) { return a.inverse(); } + + /* constructs a coordinate frame form a normalized normal */ + template __forceinline LinearSpace3 frame(const T& N) + { + const T dx0(0,N.z,-N.y); + const T dx1(-N.z,0,N.x); + const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* constructs a coordinate frame from a normal and approximate x-direction */ + template __forceinline LinearSpace3 frame(const T& N, const T& dxi) + { + if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel + const T dx = normalize(cross(dxi,N)); + const T dy = normalize(cross(N,dx)); + return LinearSpace3(dx,dy,N); + } + + /* clamps linear space to range -1 to +1 */ + template __forceinline LinearSpace3 clamp(const LinearSpace3& space) { + return LinearSpace3(clamp(space.vx,T(-1.0f),T(1.0f)), + clamp(space.vy,T(-1.0f),T(1.0f)), + clamp(space.vz,T(-1.0f),T(1.0f))); + } + + //////////////////////////////////////////////////////////////////////////////// + // Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 operator +( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } + template __forceinline LinearSpace3 operator -( const LinearSpace3& a, const LinearSpace3& b ) { return LinearSpace3(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } + + template __forceinline LinearSpace3 operator*(const typename T::Scalar & a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + template __forceinline T operator*(const LinearSpace3& a, const T & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } + template __forceinline LinearSpace3 operator*(const LinearSpace3& a, const LinearSpace3& b) { return LinearSpace3(a*b.vx, a*b.vy, a*b.vz); } + + template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const typename T::Scalar & b) { return LinearSpace3(a.vx/b, a.vy/b, a.vz/b); } + template __forceinline LinearSpace3 operator/(const LinearSpace3& a, const LinearSpace3& b) { return a * rcp(b); } + + template __forceinline LinearSpace3& operator *=( LinearSpace3& a, const LinearSpace3& b ) { return a = a * b; } + template __forceinline LinearSpace3& operator /=( LinearSpace3& a, const LinearSpace3& b ) { return a = a / b; } + + template __forceinline T xfmPoint (const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template __forceinline T xfmVector(const LinearSpace3& s, const T & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } + template __forceinline T xfmNormal(const LinearSpace3& s, const T & a) { return xfmVector(s.inverse().transposed(),a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const LinearSpace3& a, const LinearSpace3& b ) { 
return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } + template __forceinline bool operator !=( const LinearSpace3& a, const LinearSpace3& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline LinearSpace3 select ( const typename T::Scalar::Bool& s, const LinearSpace3& t, const LinearSpace3& f ) { + return LinearSpace3(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz)); + } + + /*! blending */ + template + __forceinline LinearSpace3 lerp(const LinearSpace3& l0, const LinearSpace3& l1, const float t) + { + return LinearSpace3(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const LinearSpace3& m) { + return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; + } + + /*! Shortcuts for common linear spaces. */ + typedef LinearSpace3 LinearSpace3f; + typedef LinearSpace3 LinearSpace3fa; + typedef LinearSpace3 LinearSpace3fx; + typedef LinearSpace3 LinearSpace3ff; + + template using LinearSpace3vf = LinearSpace3>>; + typedef LinearSpace3>> LinearSpace3vf4; + typedef LinearSpace3>> LinearSpace3vf8; + typedef LinearSpace3>> LinearSpace3vf16; + + /*! 
blending */ + template + __forceinline LinearSpace3 lerp(const LinearSpace3& l0, + const LinearSpace3& l1, + const S& t) + { + return LinearSpace3(lerp(l0.vx,l1.vx,t), + lerp(l0.vy,l1.vy,t), + lerp(l0.vz,l1.vz,t)); + } + +} diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h new file mode 100644 index 00000000000..6d54abd44de --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/math.h @@ -0,0 +1,451 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "constants.h" +#include + +#if defined(__ARM_NEON) +#include "SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "AVX2NEON.h" +#endif +#else +#include +#include +#include +#endif + +#if defined(__WIN32__) && !defined(__MINGW32__) +#if (__MSV_VER <= 1700) +namespace std +{ + __forceinline bool isinf ( const float x ) { return _finite(x) == 0; } + __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; } + __forceinline bool isfinite (const float x) { return _finite(x) != 0; } +} +#endif +#endif + +namespace embree +{ + __forceinline bool isvalid ( const float& v ) { + return (v > -FLT_LARGE) & (v < +FLT_LARGE); + } + + __forceinline int cast_f2i(float f) { + union { float f; int i; } v; v.f = f; return v.i; + } + + __forceinline float cast_i2f(int i) { + union { float f; int i; } v; v.i = i; return v.f; + } + + __forceinline int toInt (const float& a) { return int(a); } + __forceinline float toFloat(const int& a) { return float(a); } + +#if defined(__WIN32__) && !defined(__MINGW32__) + __forceinline bool finite ( const float x ) { return _finite(x) != 0; } +#endif + + __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; } + __forceinline float sqr ( const float x ) { return x*x; } + + __forceinline float rcp ( const float x ) + { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. 
+ __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + + const __m128 a = _mm_set_ss(x); + +#if defined(__AVX512VL__) + const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rcp_ss(a); +#endif + +#if defined(__AVX2__) + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +#else + return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +#endif + +#endif //defined(__aarch64__) + } + + __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif + } + __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else + return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif + } + __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else + return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif + } + __forceinline float rsqrt( const float x ) + { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), 
value)); + return value[0]; +#else + + const __m128 a = _mm_set_ss(x); +#if defined(__AVX512VL__) + const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); +#else + const __m128 r = _mm_rsqrt_ss(a); +#endif + const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), + _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + return _mm_cvtss_f32(c); +#endif + } + +#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__) + __forceinline float nextafter(float x, float y) { if ((x0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } + __forceinline double nextafter(double x, double y) { return _nextafter(x, y); } + __forceinline int roundf(float f) { return (int)(f + 0.5f); } +#else + __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } + __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); } +#endif + + __forceinline float abs ( const float x ) { return ::fabsf(x); } + __forceinline float acos ( const float x ) { return ::acosf (x); } + __forceinline float asin ( const float x ) { return ::asinf (x); } + __forceinline float atan ( const float x ) { return ::atanf (x); } + __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); } + __forceinline float cos ( const float x ) { return ::cosf (x); } + __forceinline float cosh ( const float x ) { return ::coshf (x); } + __forceinline float exp ( const float x ) { return ::expf (x); } + __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); } + __forceinline float log ( const float x ) { return ::logf (x); } + __forceinline float log10( const float x ) { return ::log10f(x); } + __forceinline float pow ( const float x, const float y ) { return ::powf (x, y); } + __forceinline float sin ( const float x ) { return ::sinf (x); } + __forceinline float sinh ( const float x ) { return ::sinhf (x); } + __forceinline float sqrt ( const float x ) { return ::sqrtf (x); } + 
__forceinline float tan ( const float x ) { return ::tanf (x); } + __forceinline float tanh ( const float x ) { return ::tanhf (x); } + __forceinline float floor( const float x ) { return ::floorf (x); } + __forceinline float ceil ( const float x ) { return ::ceilf (x); } + __forceinline float frac ( const float x ) { return x-floor(x); } + + __forceinline double abs ( const double x ) { return ::fabs(x); } + __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } + __forceinline double acos ( const double x ) { return ::acos (x); } + __forceinline double asin ( const double x ) { return ::asin (x); } + __forceinline double atan ( const double x ) { return ::atan (x); } + __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); } + __forceinline double cos ( const double x ) { return ::cos (x); } + __forceinline double cosh ( const double x ) { return ::cosh (x); } + __forceinline double exp ( const double x ) { return ::exp (x); } + __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); } + __forceinline double log ( const double x ) { return ::log (x); } + __forceinline double log10( const double x ) { return ::log10(x); } + __forceinline double pow ( const double x, const double y ) { return ::pow (x, y); } + __forceinline double rcp ( const double x ) { return 1.0/x; } + __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } + __forceinline double sin ( const double x ) { return ::sin (x); } + __forceinline double sinh ( const double x ) { return ::sinh (x); } + __forceinline double sqr ( const double x ) { return x*x; } + __forceinline double sqrt ( const double x ) { return ::sqrt (x); } + __forceinline double tan ( const double x ) { return ::tan (x); } + __forceinline double tanh ( const double x ) { return ::tanh (x); } + __forceinline double floor( const double x ) { return ::floor (x); } + __forceinline double ceil ( const double x ) { return ::ceil (x); } + +#if 
defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) + __forceinline float mini(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_min_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + +#if defined(__aarch64__) + __forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) + __forceinline float maxi(float a, float b) { + const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); + const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); + const __m128i ci = _mm_max_epi32(ai,bi); + return _mm_cvtss_f32(_mm_castsi128_ps(ci)); + } +#endif + + template + __forceinline T twice(const T& a) { return a+a; } + + __forceinline int min(int a, int b) { return a __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } + template __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } + template __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); } + + template __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } + template __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); } + template __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); } + + __forceinline int max(int a, int b) { return a __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } + template __forceinline T max(const T& a, const T& b, const 
T& c, const T& d) { return max(max(a,b),max(c,d)); } + template __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); } + + template __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } + template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); } + template __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); } + +#if defined(__MACOSX__) + __forceinline ssize_t min(ssize_t a, ssize_t b) { return a __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } + template __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } + + template __forceinline T deg2rad ( const T& x ) { return x * T(1.74532925199432957692e-2f); } + template __forceinline T rad2deg ( const T& x ) { return x * T(5.72957795130823208768e1f); } + template __forceinline T sin2cos ( const T& x ) { return sqrt(max(T(zero),T(one)-x*x)); } + template __forceinline T cos2sin ( const T& x ) { return sin2cos(x); } + +#if defined(__AVX2__) + __forceinline float madd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } +#elif defined (__aarch64__) && defined(__clang__) +#pragma clang fp contract(fast) + + +__forceinline float madd ( const float a, 
const float b, const float c) { return a*b + c; } +__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } +__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } +__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } + +#pragma clang fp contract(on) +#else + __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } + __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } + __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;} + __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; } +#endif + + /*! random functions */ + template T random() { return T(0); } +#if defined(_WIN32) + template<> __forceinline int random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); } +#else + template<> __forceinline int random() { return int(rand()); } + template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } +#endif + template<> __forceinline float random() { return rand()/float(RAND_MAX); } + template<> __forceinline double random() { return rand()/double(RAND_MAX); } + +#if _WIN32 + __forceinline double drand48() { + return double(rand())/double(RAND_MAX); + } + + __forceinline void srand48(long seed) { + return srand(seed); + } +#endif + + /*! selects */ + __forceinline bool select(bool s, bool t , bool f) { return s ? t : f; } + __forceinline int select(bool s, int t, int f) { return s ? t : f; } + __forceinline float select(bool s, float t, float f) { return s ? 
t : f; } + + __forceinline bool all(bool s) { return s; } + + __forceinline float lerp(const float v0, const float v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + template + __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { + return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3))); + } + + /*! exchange */ + template __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; } + + + template __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { +#if 1//!defined(__aarch64__) + return msub(a,b,c*d); +#else + return nmadd(c,d,a*b); +#endif + } + + /*! bit reverse operation */ + template + __forceinline T bitReverse(const T& vin) + { + T v = vin; + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + v = ( v >> 16 ) | ( v << 16); + return v; + } + + /*! 
bit interleave operation */ + template + __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin) + { + T x = xin, y = yin, z = zin; + x = (x | (x << 16)) & 0x030000FF; + x = (x | (x << 8)) & 0x0300F00F; + x = (x | (x << 4)) & 0x030C30C3; + x = (x | (x << 2)) & 0x09249249; + + y = (y | (y << 16)) & 0x030000FF; + y = (y | (y << 8)) & 0x0300F00F; + y = (y | (y << 4)) & 0x030C30C3; + y = (y | (y << 2)) & 0x09249249; + + z = (z | (z << 16)) & 0x030000FF; + z = (z | (z << 8)) & 0x0300F00F; + z = (z | (z << 4)) & 0x030C30C3; + z = (z | (z << 2)) & 0x09249249; + + return x | (y << 1) | (z << 2); + } + +#if defined(__AVX2__) && !defined(__aarch64__) + + template<> + __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) + { + const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ ); + const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */); + const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */); + return xx | yy | zz; + } + +#endif + + /*! 
bit interleave operation for 64bit data types*/ + template + __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){ + T x = xin & 0x1fffff; + T y = yin & 0x1fffff; + T z = zin & 0x1fffff; + + x = (x | x << 32) & 0x1f00000000ffff; + x = (x | x << 16) & 0x1f0000ff0000ff; + x = (x | x << 8) & 0x100f00f00f00f00f; + x = (x | x << 4) & 0x10c30c30c30c30c3; + x = (x | x << 2) & 0x1249249249249249; + + y = (y | y << 32) & 0x1f00000000ffff; + y = (y | y << 16) & 0x1f0000ff0000ff; + y = (y | y << 8) & 0x100f00f00f00f00f; + y = (y | y << 4) & 0x10c30c30c30c30c3; + y = (y | y << 2) & 0x1249249249249249; + + z = (z | z << 32) & 0x1f00000000ffff; + z = (z | z << 16) & 0x1f0000ff0000ff; + z = (z | z << 8) & 0x100f00f00f00f00f; + z = (z | z << 4) & 0x10c30c30c30c30c3; + z = (z | z << 2) & 0x1249249249249249; + + return x | (y << 1) | (z << 2); + } +} diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h new file mode 100644 index 00000000000..032b56904ee --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/obbox.h @@ -0,0 +1,39 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bbox.h" +#include "linearspace3.h" + +namespace embree +{ + /*! 
Oriented bounding box */ + template + struct OBBox + { + public: + + __forceinline OBBox () {} + + __forceinline OBBox (EmptyTy) + : space(one), bounds(empty) {} + + __forceinline OBBox (const BBox& bounds) + : space(one), bounds(bounds) {} + + __forceinline OBBox (const LinearSpace3& space, const BBox& bounds) + : space(space), bounds(bounds) {} + + friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { + return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}"; + } + + public: + LinearSpace3 space; //!< orthonormal transformation + BBox bounds; //!< bounds in transformed space + }; + + typedef OBBox OBBox3f; + typedef OBBox OBBox3fa; +} diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h new file mode 100644 index 00000000000..20c69bc62f4 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/quaternion.h @@ -0,0 +1,254 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "vec3.h" +#include "vec4.h" + +#include "transcendental.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////// + // Quaternion Struct + //////////////////////////////////////////////////////////////// + + template + struct QuaternionT + { + typedef Vec3 Vector; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT () { } + __forceinline QuaternionT ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; } + __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } + + __forceinline QuaternionT( const T& r ) : r(r), i(zero), j(zero), k(zero) {} + __forceinline explicit QuaternionT( const Vec3& v ) : r(zero), i(v.x), j(v.y), k(v.z) {} + 
__forceinline explicit QuaternionT( const Vec4& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {} + __forceinline QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {} + __forceinline QuaternionT( const T& r, const Vec3& v ) : r(r), i(v.x), j(v.y), k(v.z) {} + + __inline QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ); + __inline QuaternionT( const T& yaw, const T& pitch, const T& roll ); + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {} + __forceinline QuaternionT( OneTy ) : r( one), i(zero), j(zero), k(zero) {} + + /*! return quaternion for rotation around arbitrary axis */ + static __forceinline QuaternionT rotate(const Vec3& u, const T& r) { + return QuaternionT(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u)); + } + + /*! returns the rotation axis of the quaternion as a vector */ + __forceinline Vec3 v( ) const { return Vec3(i, j, k); } + + public: + T r, i, j, k; + }; + + template __forceinline QuaternionT operator *( const T & a, const QuaternionT& b ) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } + template __forceinline QuaternionT operator *( const QuaternionT& a, const T & b ) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } + + //////////////////////////////////////////////////////////////// + // Unary Operators + //////////////////////////////////////////////////////////////// + + template __forceinline QuaternionT operator +( const QuaternionT& a ) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a ) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } + template __forceinline QuaternionT conj ( const QuaternionT& a ) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } + template __forceinline T abs ( const QuaternionT& a ) { 
return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __forceinline QuaternionT rcp ( const QuaternionT& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + template __forceinline QuaternionT normalize ( const QuaternionT& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); } + + // evaluates a*q-r + template __forceinline QuaternionT + msub(const T& a, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(msub(a, q.r, p.r), + msub(a, q.i, p.i), + msub(a, q.j, p.j), + msub(a, q.k, p.k)); + } + // evaluates a*q-r + template __forceinline QuaternionT + madd (const T& a, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(madd(a, q.r, p.r), + madd(a, q.i, p.i), + madd(a, q.j, p.j), + madd(a, q.k, p.k)); + } + + //////////////////////////////////////////////////////////////// + // Binary Operators + //////////////////////////////////////////////////////////////// + + template __forceinline QuaternionT operator +( const T & a, const QuaternionT& b ) { return QuaternionT(a + b.r, b.i, b.j, b.k); } + template __forceinline QuaternionT operator +( const QuaternionT& a, const T & b ) { return QuaternionT(a.r + b, a.i, a.j, a.k); } + template __forceinline QuaternionT operator +( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } + template __forceinline QuaternionT operator -( const T & a, const QuaternionT& b ) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a, const T & b ) { return QuaternionT(a.r - b, a.i, a.j, a.k); } + template __forceinline QuaternionT operator -( const QuaternionT& a, const QuaternionT& b ) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } + + template __forceinline Vec3 operator *( const QuaternionT& a, const Vec3 & b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline QuaternionT operator *( const QuaternionT& a, 
const QuaternionT& b ) { + return QuaternionT(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k, + a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j, + a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i, + a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r); + } + template __forceinline QuaternionT operator /( const T & a, const QuaternionT& b ) { return a*rcp(b); } + template __forceinline QuaternionT operator /( const QuaternionT& a, const T & b ) { return a*rcp(b); } + template __forceinline QuaternionT operator /( const QuaternionT& a, const QuaternionT& b ) { return a*rcp(b); } + + template __forceinline QuaternionT& operator +=( QuaternionT& a, const T & b ) { return a = a+b; } + template __forceinline QuaternionT& operator +=( QuaternionT& a, const QuaternionT& b ) { return a = a+b; } + template __forceinline QuaternionT& operator -=( QuaternionT& a, const T & b ) { return a = a-b; } + template __forceinline QuaternionT& operator -=( QuaternionT& a, const QuaternionT& b ) { return a = a-b; } + template __forceinline QuaternionT& operator *=( QuaternionT& a, const T & b ) { return a = a*b; } + template __forceinline QuaternionT& operator *=( QuaternionT& a, const QuaternionT& b ) { return a = a*b; } + template __forceinline QuaternionT& operator /=( QuaternionT& a, const T & b ) { return a = a*rcp(b); } + template __forceinline QuaternionT& operator /=( QuaternionT& a, const QuaternionT& b ) { return a = a*rcp(b); } + + template __forceinline QuaternionT + select(const M& m, const QuaternionT& q, const QuaternionT& p) + { + return QuaternionT(select(m, q.r, p.r), + select(m, q.i, p.i), + select(m, q.j, p.j), + select(m, q.k, p.k)); + } + + + template __forceinline Vec3 xfmPoint ( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline Vec3 xfmVector( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + template __forceinline Vec3 xfmNormal( const QuaternionT& a, const Vec3& b ) { return (a*QuaternionT(b)*conj(a)).v(); } + + 
template __forceinline T dot(const QuaternionT& a, const QuaternionT& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const QuaternionT& a, const QuaternionT& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } + template __forceinline bool operator !=( const QuaternionT& a, const QuaternionT& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Orientation Functions + //////////////////////////////////////////////////////////////////////////////// + + template QuaternionT::QuaternionT( const Vec3& vx, const Vec3& vy, const Vec3& vz ) + { + if ( vx.x + vy.y + vz.z >= T(zero) ) + { + const T t = T(one) + (vx.x + vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = t*s; + i = (vy.z - vz.y)*s; + j = (vz.x - vx.z)*s; + k = (vx.y - vy.x)*s; + } + else if ( vx.x >= max(vy.y, vz.z) ) + { + const T t = (T(one) + vx.x) - (vy.y + vz.z); + const T s = rsqrt(t)*T(0.5f); + r = (vy.z - vz.y)*s; + i = t*s; + j = (vx.y + vy.x)*s; + k = (vz.x + vx.z)*s; + } + else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) ) + { + const T t = (T(one) + vy.y) - (vz.z + vx.x); + const T s = rsqrt(t)*T(0.5f); + r = (vz.x - vx.z)*s; + i = (vx.y + vy.x)*s; + j = t*s; + k = (vy.z + vz.y)*s; + } + else //if ( vz.z >= max(vy.y, vx.x) ) + { + const T t = (T(one) + vz.z) - (vx.x + vy.y); + const T s = rsqrt(t)*T(0.5f); + r = (vx.y - vy.x)*s; + i = (vz.x + vx.z)*s; + j = (vy.z + vz.y)*s; + k = t*s; + } + } + + template QuaternionT::QuaternionT( const T& yaw, const T& pitch, const T& roll ) + { + const T cya = cos(yaw *T(0.5f)); + const T cpi = cos(pitch*T(0.5f)); + const T cro = cos(roll *T(0.5f)); + const T sya = sin(yaw *T(0.5f)); + 
const T spi = sin(pitch*T(0.5f)); + const T sro = sin(roll *T(0.5f)); + r = cro*cya*cpi + sro*sya*spi; + i = cro*cya*spi + sro*sya*cpi; + j = cro*sya*cpi - sro*cya*spi; + k = sro*cya*cpi - cro*sya*spi; + } + + ////////////////////////////////////////////////////////////////////////////// + /// Output Operators + ////////////////////////////////////////////////////////////////////////////// + + template static embree_ostream operator<<(embree_ostream cout, const QuaternionT& q) { + return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; + } + + /*! default template instantiations */ + typedef QuaternionT Quaternion3f; + typedef QuaternionT Quaternion3d; + + template using Quaternion3vf = QuaternionT>; + typedef QuaternionT> Quaternion3vf4; + typedef QuaternionT> Quaternion3vf8; + typedef QuaternionT> Quaternion3vf16; + + ////////////////////////////////////////////////////////////////////////////// + /// Interpolation + ////////////////////////////////////////////////////////////////////////////// + template + __forceinline QuaternionTlerp(const QuaternionT& q0, + const QuaternionT& q1, + const T& factor) + { + QuaternionT q; + q.r = lerp(q0.r, q1.r, factor); + q.i = lerp(q0.i, q1.i, factor); + q.j = lerp(q0.j, q1.j, factor); + q.k = lerp(q0.k, q1.k, factor); + return q; + } + + template + __forceinline QuaternionT slerp(const QuaternionT& q0, + const QuaternionT& q1_, + const T& t) + { + T cosTheta = dot(q0, q1_); + QuaternionT q1 = select(cosTheta < 0.f, -q1_, q1_); + cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); + if (unlikely(all(cosTheta > 0.9995f))) { + return normalize(lerp(q0, q1, t)); + } + const T phi = t * fastapprox::acos(cosTheta); + T sinPhi, cosPhi; + fastapprox::sincos(phi, sinPhi, cosPhi); + QuaternionT qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); + return msub(cosPhi, q0, qperp); + } +} diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h 
new file mode 100644 index 00000000000..762d9cd9eab --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/range.h @@ -0,0 +1,137 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../math/math.h" + +namespace embree +{ + template + struct range + { + __forceinline range() {} + + __forceinline range(const Ty& begin) + : _begin(begin), _end(begin+1) {} + + __forceinline range(const Ty& begin, const Ty& end) + : _begin(begin), _end(end) {} + + __forceinline range(const range& other) + : _begin(other._begin), _end(other._end) {} + + template + __forceinline range(const range& other) + : _begin(Ty(other._begin)), _end(Ty(other._end)) {} + + template + __forceinline range& operator =(const range& other) { + _begin = other._begin; + _end = other._end; + return *this; + } + + __forceinline Ty begin() const { + return _begin; + } + + __forceinline Ty end() const { + return _end; + } + + __forceinline range intersect(const range& r) const { + return range (max(_begin,r._begin),min(_end,r._end)); + } + + __forceinline Ty size() const { + return _end - _begin; + } + + __forceinline bool empty() const { + return _end <= _begin; + } + + __forceinline Ty center() const { + return (_begin + _end)/2; + } + + __forceinline std::pair split() const + { + const Ty _center = center(); + return std::make_pair(range(_begin,_center),range(_center,_end)); + } + + __forceinline void split(range& left_o, range& right_o) const + { + const Ty _center = center(); + left_o = range(_begin,_center); + right_o = range(_center,_end); + } + + __forceinline friend bool operator< (const range& r0, const range& r1) { + return r0.size() < r1.size(); + } + + friend embree_ostream operator<<(embree_ostream cout, const range& r) { + return cout << "range [" << r.begin() << ", " << r.end() << "]"; + } + + Ty _begin, _end; + }; + + template + range make_range(const Ty& begin, const Ty& end) { + return 
range(begin,end); + } + + template + struct extended_range : public range + { + __forceinline extended_range () {} + + __forceinline extended_range (const Ty& begin) + : range(begin), _ext_end(begin+1) {} + + __forceinline extended_range (const Ty& begin, const Ty& end) + : range(begin,end), _ext_end(end) {} + + __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end) + : range(begin,end), _ext_end(ext_end) {} + + __forceinline Ty ext_end() const { + return _ext_end; + } + + __forceinline Ty ext_size() const { + return _ext_end - range::_begin; + } + + __forceinline Ty ext_range_size() const { + return _ext_end - range::_end; + } + + __forceinline bool has_ext_range() const { + assert(_ext_end >= range::_end); + return (_ext_end - range::_end) > 0; + } + + __forceinline void set_ext_range(const size_t ext_end){ + assert(ext_end >= range::_end); + _ext_end = ext_end; + } + + __forceinline void move_right(const size_t plus){ + range::_begin += plus; + range::_end += plus; + _ext_end += plus; + } + + friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) { + return cout << "extended_range [" << r.begin() << ", " << r.end() << " (" << r.ext_end() << ")]"; + } + + Ty _ext_end; + }; +} diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h new file mode 100644 index 00000000000..6855d82b534 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/transcendental.h @@ -0,0 +1,525 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// Transcendental functions from "ispc": https://github.com/ispc/ispc/ +// Most of the transcendental implementations in ispc code come from +// Solomon Boulos's "syrah": https://github.com/boulos/syrah/ + +#include "../simd/simd.h" + +namespace embree +{ + +namespace fastapprox +{ + +template +__forceinline T sin(const T &v) +{ + static const float piOverTwoVec = 
1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto flipSign = (kMod4 > 1); + + // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, + // 4, 6, 8, 10|], [|single...|], [0;Pi/2]); + static const float sinC2 = -0.16666667163372039794921875; + static const float sinC4 = +8.333347737789154052734375e-3; + static const float sinC6 = -1.9842604524455964565277099609375e-4; + static const float sinC8 = +2.760012648650445044040679931640625e-6; + static const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + static const float cosC2 = -0.5; + static const float cosC4 = +4.166664183139801025390625e-2; + static const float cosC6 = -1.388833043165504932403564453125e-3; + static const float cosC8 = +2.47562347794882953166961669921875e-5; + static const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(sinUseCos, 1., x); + auto c2 = select(sinUseCos, T(cosC2), T(sinC2)); + auto c4 = select(sinUseCos, T(cosC4), T(sinC4)); + auto c6 = select(sinUseCos, T(cosC6), T(sinC6)); + auto c8 = select(sinUseCos, T(cosC8), T(sinC8)); + auto c10 = select(sinUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template +__forceinline T cos(const T &v) +{ + static const float piOverTwoVec = 1.57079637050628662109375; + static const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x 
= v - kReal * piOverTwoVec; + + auto kMod4 = k & 3; + auto cosUseCos = (kMod4 == 0 | kMod4 == 2); + auto flipSign = (kMod4 == 1 | kMod4 == 2); + + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto outside = select(cosUseCos, 1., x); + auto c2 = select(cosUseCos, T(cosC2), T(sinC2)); + auto c4 = select(cosUseCos, T(cosC4), T(sinC4)); + auto c6 = select(cosUseCos, T(cosC6), T(sinC6)); + auto c8 = select(cosUseCos, T(cosC8), T(sinC8)); + auto c10 = select(cosUseCos, T(cosC10), T(sinC10)); + + auto x2 = x * x; + auto formula = x2 * c10 + c8; + formula = x2 * formula + c6; + formula = x2 * formula + c4; + formula = x2 * formula + c2; + formula = x2 * formula + 1.; + formula *= outside; + + formula = select(flipSign, -formula, formula); + return formula; +} + +template +__forceinline void sincos(const T &v, T &sinResult, T &cosResult) +{ + const float piOverTwoVec = 1.57079637050628662109375; + const float twoOverPiVec = 0.636619746685028076171875; + auto scaled = v * twoOverPiVec; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * piOverTwoVec; + auto kMod4 = k & 3; + auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2)); + auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3)); + auto sinFlipSign = (kMod4 > 1); + auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2)); + + const float oneVec = +1.; + const float sinC2 = -0.16666667163372039794921875; + const float sinC4 = +8.333347737789154052734375e-3; + 
const float sinC6 = -1.9842604524455964565277099609375e-4; + const float sinC8 = +2.760012648650445044040679931640625e-6; + const float sinC10 = -2.50293279435709337121807038784027099609375e-8; + + const float cosC2 = -0.5; + const float cosC4 = +4.166664183139801025390625e-2; + const float cosC6 = -1.388833043165504932403564453125e-3; + const float cosC8 = +2.47562347794882953166961669921875e-5; + const float cosC10 = -2.59630184018533327616751194000244140625e-7; + + auto x2 = x * x; + + auto sinFormula = x2 * sinC10 + sinC8; + auto cosFormula = x2 * cosC10 + cosC8; + sinFormula = x2 * sinFormula + sinC6; + cosFormula = x2 * cosFormula + cosC6; + + sinFormula = x2 * sinFormula + sinC4; + cosFormula = x2 * cosFormula + cosC4; + + sinFormula = x2 * sinFormula + sinC2; + cosFormula = x2 * cosFormula + cosC2; + + sinFormula = x2 * sinFormula + oneVec; + cosFormula = x2 * cosFormula + oneVec; + + sinFormula *= x; + + sinResult = select(sinUseCos, cosFormula, sinFormula); + cosResult = select(cosUseCos, cosFormula, sinFormula); + + sinResult = select(sinFlipSign, -sinResult, sinResult); + cosResult = select(cosFlipSign, -cosResult, cosResult); +} + +template +__forceinline T tan(const T &v) +{ + const float piOverFourVec = 0.785398185253143310546875; + const float fourOverPiVec = 1.27323949337005615234375; + + auto xLt0 = v < 0.; + auto y = select(xLt0, -v, v); + auto scaled = y * fourOverPiVec; + + auto kReal = floor(scaled); + auto k = toInt(kReal); + + auto x = y - kReal * piOverFourVec; + + // If k & 1, x -= Pi/4 + auto needOffset = (k & 1) != 0; + x = select(needOffset, x - piOverFourVec, x); + + // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To... 
+ auto kMod4 = k & 3; + auto useCotan = (kMod4 == 1) | (kMod4 == 2); + + const float oneVec = 1.0; + + const float tanC2 = +0.33333075046539306640625; + const float tanC4 = +0.13339905440807342529296875; + const float tanC6 = +5.3348250687122344970703125e-2; + const float tanC8 = +2.46033705770969390869140625e-2; + const float tanC10 = +2.892402000725269317626953125e-3; + const float tanC12 = +9.500005282461643218994140625e-3; + + const float cotC2 = -0.3333333432674407958984375; + const float cotC4 = -2.222204394638538360595703125e-2; + const float cotC6 = -2.11752182804048061370849609375e-3; + const float cotC8 = -2.0846328698098659515380859375e-4; + const float cotC10 = -2.548247357481159269809722900390625e-5; + const float cotC12 = -3.5257363606433500535786151885986328125e-7; + + auto x2 = x * x; + T z; + if (any(useCotan)) + { + auto cotVal = x2 * cotC12 + cotC10; + cotVal = x2 * cotVal + cotC8; + cotVal = x2 * cotVal + cotC6; + cotVal = x2 * cotVal + cotC4; + cotVal = x2 * cotVal + cotC2; + cotVal = x2 * cotVal + oneVec; + // The equation is for x * cot(x) but we need -x * cot(x) for the tan part. 
+ cotVal /= -x; + z = cotVal; + } + auto useTan = !useCotan; + if (any(useTan)) + { + auto tanVal = x2 * tanC12 + tanC10; + tanVal = x2 * tanVal + tanC8; + tanVal = x2 * tanVal + tanC6; + tanVal = x2 * tanVal + tanC4; + tanVal = x2 * tanVal + tanC2; + tanVal = x2 * tanVal + oneVec; + // Equation was for tan(x)/x + tanVal *= x; + z = select(useTan, tanVal, z); + } + return select(xLt0, -z, z); +} + +template +__forceinline T asin(const T &x0) +{ + auto isneg = (x0 < 0.f); + auto x = abs(x0); + auto isnan = (x > 1.f); + + // sollya + // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|], + // [1e-20;.9999999999999999]); + // avg error: 1.1105439e-06, max error 1.3187528e-06 + auto v = 1.57079517841339111328125f + + x * (-0.21450997889041900634765625f + + x * (8.78556668758392333984375e-2f + + x * (-4.489909112453460693359375e-2f + + x * (1.928029954433441162109375e-2f + + x * (-4.3095736764371395111083984375e-3f))))); + + v *= -sqrt(1.f - x); + v = v + 1.57079637050628662109375f; + + v = select(v < 0.f, T(0.f), v); + v = select(isneg, -v, v); + v = select(isnan, T(cast_i2f(0x7fc00000)), v); + + return v; +} + +template +__forceinline T acos(const T &v) +{ + return 1.57079637050628662109375f - asin(v); +} + +template +__forceinline T atan(const T &v) +{ + const float piOverTwoVec = 1.57079637050628662109375; + // atan(-x) = -atan(x) (so flip from negative to positive first) + // If x > 1 -> atan(x) = Pi/2 - atan(1/x) + auto xNeg = v < 0.f; + auto xFlipped = select(xNeg, -v, v); + + auto xGt1 = xFlipped > 1.; + auto x = select(xGt1, rcpSafe(xFlipped), xFlipped); + + // These coefficients approximate atan(x)/x + const float atanC0 = +0.99999988079071044921875; + const float atanC2 = -0.3333191573619842529296875; + const float atanC4 = +0.199689209461212158203125; + const float atanC6 = -0.14015688002109527587890625; + const float atanC8 = +9.905083477497100830078125e-2; + const float atanC10 = -5.93664981424808502197265625e-2; + const float atanC12 = 
+2.417283318936824798583984375e-2; + const float atanC14 = -4.6721356920897960662841796875e-3; + + auto x2 = x * x; + auto result = x2 * atanC14 + atanC12; + result = x2 * result + atanC10; + result = x2 * result + atanC8; + result = x2 * result + atanC6; + result = x2 * result + atanC4; + result = x2 * result + atanC2; + result = x2 * result + atanC0; + result *= x; + + result = select(xGt1, piOverTwoVec - result, result); + result = select(xNeg, -result, result); + return result; +} + +template +__forceinline T atan2(const T &y, const T &x) +{ + const float piVec = 3.1415926536; + // atan2(y, x) = + // + // atan2(y > 0, x = +-0) -> Pi/2 + // atan2(y < 0, x = +-0) -> -Pi/2 + // atan2(y = +-0, x < +0) -> +-Pi + // atan2(y = +-0, x >= +0) -> +-0 + // + // atan2(y >= 0, x < 0) -> Pi + atan(y/x) + // atan2(y < 0, x < 0) -> -Pi + atan(y/x) + // atan2(y, x > 0) -> atan(y/x) + // + // and then a bunch of code for dealing with infinities. + auto yOverX = y * rcpSafe(x); + auto atanArg = atan(yOverX); + auto xLt0 = x < 0.f; + auto yLt0 = y < 0.f; + auto offset = select(xLt0, + select(yLt0, T(-piVec), T(piVec)), 0.f); + return offset + atanArg; +} + +template +__forceinline T exp(const T &v) +{ + const float ln2Part1 = 0.6931457519; + const float ln2Part2 = 1.4286067653e-6; + const float oneOverLn2 = 1.44269502162933349609375; + + auto scaled = v * oneOverLn2; + auto kReal = floor(scaled); + auto k = toInt(kReal); + + // Reduced range version of x + auto x = v - kReal * ln2Part1; + x -= kReal * ln2Part2; + + // These coefficients are for e^x in [0, ln(2)] + const float one = 1.; + const float c2 = 0.4999999105930328369140625; + const float c3 = 0.166668415069580078125; + const float c4 = 4.16539050638675689697265625e-2; + const float c5 = 8.378830738365650177001953125e-3; + const float c6 = 1.304379315115511417388916015625e-3; + const float c7 = 2.7555381529964506626129150390625e-4; + + auto result = x * c7 + c6; + result = x * result + c5; + result = x * result + c4; + 
result = x * result + c3; + result = x * result + c2; + result = x * result + one; + result = x * result + one; + + // Compute 2^k (should differ for float and double, but I'll avoid + // it for now and just do floats) + const int fpbias = 127; + auto biasedN = k + fpbias; + auto overflow = kReal > fpbias; + // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0) + // we've got underflow. -127 * ln(2) -> -88.02. So the most + // negative float input that doesn't result in zero is like -88. + auto underflow = kReal <= -fpbias; + const int infBits = 0x7f800000; + biasedN <<= 23; + // Reinterpret this thing as float + auto twoToTheN = asFloat(biasedN); + // Handle both doubles and floats (hopefully eliding the copy for float) + auto elemtype2n = twoToTheN; + result *= elemtype2n; + result = select(overflow, cast_i2f(infBits), result); + result = select(underflow, 0., result); + return result; +} + +// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n +// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)). +template +__forceinline void __rangeReduceLog(const T &input, + T &reduced, + R &exponent) +{ + auto intVersion = asInt(input); + // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM + // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000 + // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0 + // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111 + // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF + + //const int exponentMask(0x7F800000) + static const int nonexponentMask = 0x807FFFFF; + + // We want the reduced version to have an exponent of -1 which is + // -1 + 127 after biasing or 126 + static const int exponentNeg1 = (126l << 23); + // NOTE(boulos): We don't need to mask anything out since we know + // the sign bit has to be 0. If it's 1, we need to return infinity/nan + // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). 
+ auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128] + + auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2 + exponent = offsetExponent - 127; // get the real value + + // Blend the offset_exponent with the original input (do this in + // int for now, until I decide if float can have & and ¬) + auto blended = (intVersion & nonexponentMask) | (exponentNeg1); + reduced = asFloat(blended); +} + +template struct ExponentType { }; +template struct ExponentType> { typedef vint Ty; }; +template <> struct ExponentType { typedef int Ty; }; + +template +__forceinline T log(const T &v) +{ + T reduced; + typename ExponentType::Ty exponent; + + const int nanBits = 0x7fc00000; + const int negInfBits = 0xFF800000; + const float nan = cast_i2f(nanBits); + const float negInf = cast_i2f(negInfBits); + auto useNan = v < 0.; + auto useInf = v == 0.; + auto exceptional = useNan | useInf; + const float one = 1.0; + + auto patched = select(exceptional, one, v); + __rangeReduceLog(patched, reduced, exponent); + + const float ln2 = 0.693147182464599609375; + + auto x1 = one - reduced; + const float c1 = +0.50000095367431640625; + const float c2 = +0.33326041698455810546875; + const float c3 = +0.2519190013408660888671875; + const float c4 = +0.17541764676570892333984375; + const float c5 = +0.3424419462680816650390625; + const float c6 = -0.599632322788238525390625; + const float c7 = +1.98442304134368896484375; + const float c8 = -2.4899270534515380859375; + const float c9 = +1.7491014003753662109375; + + auto result = x1 * c9 + c8; + result = x1 * result + c7; + result = x1 * result + c6; + result = x1 * result + c5; + result = x1 * result + c4; + result = x1 * result + c3; + result = x1 * result + c2; + result = x1 * result + c1; + result = x1 * result + one; + + // Equation was for -(ln(red)/(1-red)) + result *= -x1; + result += toFloat(exponent) * ln2; + + return select(exceptional, + select(useNan, 
T(nan), T(negInf)), + result); +} + +template +__forceinline T pow(const T &x, const T &y) +{ + auto x1 = abs(x); + auto z = exp(y * log(x1)); + + // Handle special cases + const float twoOver23 = 8388608.0f; + auto yInt = y == round(y); + auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit + + // x == 0 + z = select(x == 0.0f, + select(y < 0.0f, T(inf) | signmsk(x), + select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z); + + // x < 0 + auto xNegative = x < 0.0f; + if (any(xNegative)) + { + auto z1 = z | asFloat(yOddInt); + z1 = select(yInt, z1, std::numeric_limits::quiet_NaN()); + z = select(xNegative, z1, z); + } + + auto xFinite = isfinite(x); + auto yFinite = isfinite(y); + if (all(xFinite & yFinite)) + return z; + + // x finite and y infinite + z = select(andn(xFinite, yFinite), + select(x1 == 1.0f, 1.0f, + select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z); + + // x infinite + z = select(xFinite, z, + select(y == 0.0f, 1.0f, + select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x))); + + return z; +} + +template +__forceinline T pow(const T &x, float y) +{ + return pow(x, T(y)); +} + +} // namespace fastapprox + +} // namespace embree diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h new file mode 100644 index 00000000000..a619459e9c1 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec2.h @@ -0,0 +1,235 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec2fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 2D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec2 + { + enum { N = 2 }; + union { + struct { T x, y; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef 
T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ) {} + __forceinline explicit Vec2( const T& a ) : x(a), y(a) {} + __forceinline Vec2( const T& x, const T& y ) : x(x), y(y) {} + + __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2( const Vec2fa& other ); + + template __forceinline Vec2( const Vec2& a ) : x(T(a.x)), y(T(a.y)) {} + template __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2( ZeroTy ) : x(zero), y(zero) {} + __forceinline Vec2( OneTy ) : x(one), y(one) {} + __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {} + __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {} + +#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 2); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; } + __forceinline T& operator [](const size_t axis ) { assert(axis < 2); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 operator +( const Vec2& a ) { return Vec2(+a.x, +a.y); } + template __forceinline Vec2 operator -( const 
Vec2& a ) { return Vec2(-a.x, -a.y); } + template __forceinline Vec2 abs ( const Vec2& a ) { return Vec2(abs (a.x), abs (a.y)); } + template __forceinline Vec2 rcp ( const Vec2& a ) { return Vec2(rcp (a.x), rcp (a.y)); } + template __forceinline Vec2 rsqrt ( const Vec2& a ) { return Vec2(rsqrt(a.x), rsqrt(a.y)); } + template __forceinline Vec2 sqrt ( const Vec2& a ) { return Vec2(sqrt (a.x), sqrt (a.y)); } + template __forceinline Vec2 frac ( const Vec2& a ) { return Vec2(frac (a.x), frac (a.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 operator +( const Vec2& a, const Vec2& b ) { return Vec2(a.x + b.x, a.y + b.y); } + template __forceinline Vec2 operator +( const Vec2& a, const T& b ) { return Vec2(a.x + b , a.y + b ); } + template __forceinline Vec2 operator +( const T& a, const Vec2& b ) { return Vec2(a + b.x, a + b.y); } + template __forceinline Vec2 operator -( const Vec2& a, const Vec2& b ) { return Vec2(a.x - b.x, a.y - b.y); } + template __forceinline Vec2 operator -( const Vec2& a, const T& b ) { return Vec2(a.x - b , a.y - b ); } + template __forceinline Vec2 operator -( const T& a, const Vec2& b ) { return Vec2(a - b.x, a - b.y); } + template __forceinline Vec2 operator *( const Vec2& a, const Vec2& b ) { return Vec2(a.x * b.x, a.y * b.y); } + template __forceinline Vec2 operator *( const T& a, const Vec2& b ) { return Vec2(a * b.x, a * b.y); } + template __forceinline Vec2 operator *( const Vec2& a, const T& b ) { return Vec2(a.x * b , a.y * b ); } + template __forceinline Vec2 operator /( const Vec2& a, const Vec2& b ) { return Vec2(a.x / b.x, a.y / b.y); } + template __forceinline Vec2 operator /( const Vec2& a, const T& b ) { return Vec2(a.x / b , a.y / b ); } + template __forceinline Vec2 operator /( const T& a, const Vec2& b ) { return Vec2(a / b.x, a / b.y); } + + 
template __forceinline Vec2 min(const Vec2& a, const Vec2& b) { return Vec2(min(a.x, b.x), min(a.y, b.y)); } + template __forceinline Vec2 max(const Vec2& a, const Vec2& b) { return Vec2(max(a.x, b.x), max(a.y, b.y)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 madd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); } + template __forceinline Vec2 msub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); } + template __forceinline Vec2 nmadd ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); } + template __forceinline Vec2 nmsub ( const Vec2& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); } + + template __forceinline Vec2 madd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } + template __forceinline Vec2 msub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2( msub(a,b.x,c.x), msub(a,b.y,c.y) ); } + template __forceinline Vec2 nmadd ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); } + template __forceinline Vec2 nmsub ( const T& a, const Vec2& b, const Vec2& c) { return Vec2(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2& operator +=( Vec2& a, const Vec2& b ) { a.x += b.x; a.y += b.y; return a; } + template __forceinline Vec2& operator -=( Vec2& a, const Vec2& b ) { a.x -= b.x; a.y -= b.y; return a; } + template __forceinline Vec2& operator *=( Vec2& a, const T& b ) { a.x *= b ; a.y 
*= b ; return a; } + template __forceinline Vec2& operator /=( Vec2& a, const T& b ) { a.x /= b ; a.y /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec2& a ) { return a.x + a.y; } + template __forceinline T reduce_mul( const Vec2& a ) { return a.x * a.y; } + template __forceinline T reduce_min( const Vec2& a ) { return min(a.x, a.y); } + template __forceinline T reduce_max( const Vec2& a ) { return max(a.x, a.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec2& a, const Vec2& b ) { return a.x == b.x && a.y == b.y; } + template __forceinline bool operator !=( const Vec2& a, const Vec2& b ) { return a.x != b.x || a.y != b.y; } + template __forceinline bool operator < ( const Vec2& a, const Vec2& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 shift_right_1( const Vec2& a ) { + return Vec2(shift_right_1(a.x),shift_right_1(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T dot ( const Vec2& a, const Vec2& b ) { return madd(a.x,b.x,a.y*b.y); } + template __forceinline Vec2 cross ( const Vec2& a ) { return Vec2(-a.y,a.x); } + template __forceinline T length ( const Vec2& a ) { return sqrt(dot(a,a)); } + 
template __forceinline Vec2 normalize( const Vec2& a ) { return a*rsqrt(dot(a,a)); } + template __forceinline T distance ( const Vec2& a, const Vec2& b ) { return length(a-b); } + template __forceinline T det ( const Vec2& a, const Vec2& b ) { return a.x*b.y - a.y*b.x; } + + template __forceinline Vec2 normalize_safe( const Vec2& a ) { + const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) ); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec2 select ( bool s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template __forceinline Vec2 select ( const Vec2& s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s.x,t.x,f.x),select(s.y,t.y,f.y)); + } + + template __forceinline Vec2 select ( const typename T::Bool& s, const Vec2& t, const Vec2& f ) { + return Vec2(select(s,t.x,f.x),select(s,t.y,f.y)); + } + + template + __forceinline Vec2 lerp(const Vec2& v0, const Vec2& v1, const T& t) { + return madd(Vec2(T(1.0f)-t),v0,t*v1); + } + + template __forceinline int maxDim ( const Vec2& a ) + { + const Vec2 b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec2 Vec2b; + typedef Vec2 Vec2i; + typedef Vec2 Vec2f; +} + +#include "vec2fa.h" + +#if defined(__SSE__) || defined(__ARM_NEON) +#include 
"../simd/sse.h" +#endif + +#if defined(__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} + +#if defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX__) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec2::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h new file mode 100644 index 00000000000..451ecd556cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec2fa.h @@ -0,0 +1,317 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec2fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec2fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 2 }; + union { + __m128 m128; + struct { float x,y,az,aw; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ) {} + __forceinline Vec2fa( const __m128 a ) : m128(a) {} + + __forceinline Vec2fa ( const Vec2& other ) { x = other.x; y = other.y; } + __forceinline Vec2fa& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; } + + __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } + __forceinline Vec2fa& operator =( const Vec2fa& other ) { 
m128 = other.m128; return *this; } + + __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} + + __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec2fa load( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline Vec2fa loadu( const void* const a ) { + return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); + } + + static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { + _mm_storeu_ps((float*)ptr,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } + __forceinline Vec2fa operator -( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec2fa abs ( const Vec2fa& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec2fa sign ( const Vec2fa& a ) { + return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); + } + + __forceinline Vec2fa rcp ( const Vec2fa& a ) + { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else +#if defined(__AVX512VL__) + const Vec2fa r = _mm_rcp14_ps(a.m128); +#else + const Vec2fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); +#else + const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; +#endif //defined(__aarch64__) + } + + __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } + + __forceinline Vec2fa rsqrt( const Vec2fa& a ) + { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, 
r))); + +#endif + } + + __forceinline Vec2fa zero_fix(const Vec2fa& a) { + return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec2fa rcp_safe(const Vec2fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec2fa log ( const Vec2fa& a ) { + return Vec2fa(logf(a.x),logf(a.y)); + } + + __forceinline Vec2fa exp ( const Vec2fa& a ) { + return Vec2fa(expf(a.x),expf(a.y)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } + __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } + __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if 
defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { + return Vec2fa(powf(a.x,b),powf(a.y,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } +#else + __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } + __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } + __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} + __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } +#endif + + __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } + __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } + __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } + __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } + __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } + __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } + __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } + __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } + __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } + __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } + __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return 
_mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); + } +#else + __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec2fa cross ( const Vec2fa& a ) { + return Vec2fa(-a.y,a.x); + } + + __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { + __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f, t, mask); + } + + __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec2fa& a ) + { + const Vec2fa b = abs(a); + if (b.x > b.y) return 0; + else return 1; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) +__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } +__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } +//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } +#elif defined (__SSE4_1__) + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec2fa floor( const Vec2fa& a ) { 
return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } +#else + //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } + __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { + return cout << "(" << a.x << ", " << a.y << ")"; + } + + typedef Vec2fa Vec2fa_t; +} diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h new file mode 100644 index 00000000000..18703217151 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3.h @@ -0,0 +1,349 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" + +namespace embree +{ + struct Vec3fa; + + //////////////////////////////////////////////////////////////////////////////// + /// Generic 3D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec3 + { + enum { N = 3 }; + + union { + struct { + T x, y, z; + }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ) {} + __forceinline explicit Vec3( const T& a ) : x(a), y(a), z(a) {} + __forceinline Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), 
z(z) {} + + __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; } + __forceinline Vec3( const Vec3fa& other ); + + template __forceinline Vec3( const Vec3& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {} + template __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3( ZeroTy ) : x(zero), y(zero), z(zero) {} + __forceinline Vec3( OneTy ) : x(one), y(one), z(one) {} + __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {} + __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; } + __forceinline T& operator []( const size_t axis ) { assert(axis < 3); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 3); return components[axis]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 operator +( const Vec3& a ) { return Vec3(+a.x, +a.y, +a.z); } + template __forceinline Vec3 operator -( const Vec3& a ) { return Vec3(-a.x, -a.y, -a.z); } + template __forceinline Vec3 abs ( const Vec3& a ) { return Vec3(abs (a.x), abs (a.y), abs (a.z)); } + template __forceinline Vec3 rcp ( const Vec3& a ) { return Vec3(rcp (a.x), rcp (a.y), rcp (a.z)); 
} + template __forceinline Vec3 rsqrt ( const Vec3& a ) { return Vec3(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); } + template __forceinline Vec3 sqrt ( const Vec3& a ) { return Vec3(sqrt (a.x), sqrt (a.y), sqrt (a.z)); } + + template __forceinline Vec3 zero_fix( const Vec3& a ) + { + return Vec3(select(abs(a.x) __forceinline Vec3 rcp_safe(const Vec3& a) { return rcp(zero_fix(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 operator +( const Vec3& a, const Vec3& b ) { return Vec3(a.x + b.x, a.y + b.y, a.z + b.z); } + template __forceinline Vec3 operator -( const Vec3& a, const Vec3& b ) { return Vec3(a.x - b.x, a.y - b.y, a.z - b.z); } + template __forceinline Vec3 operator *( const Vec3& a, const Vec3& b ) { return Vec3(a.x * b.x, a.y * b.y, a.z * b.z); } + template __forceinline Vec3 operator *( const T& a, const Vec3& b ) { return Vec3(a * b.x, a * b.y, a * b.z); } + template __forceinline Vec3 operator *( const Vec3& a, const T& b ) { return Vec3(a.x * b , a.y * b , a.z * b ); } + template __forceinline Vec3 operator /( const Vec3& a, const T& b ) { return Vec3(a.x / b , a.y / b , a.z / b ); } + template __forceinline Vec3 operator /( const T& a, const Vec3& b ) { return Vec3(a / b.x, a / b.y, a / b.z); } + template __forceinline Vec3 operator /( const Vec3& a, const Vec3& b ) { return Vec3(a.x / b.x, a.y / b.y, a.z / b.z); } + + template __forceinline Vec3 min(const Vec3& a, const Vec3& b) { return Vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } + template __forceinline Vec3 max(const Vec3& a, const Vec3& b) { return Vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } + + template __forceinline Vec3 operator >>( const Vec3& a, const int b ) { return Vec3(a.x >> b, a.y >> b, a.z >> b); } + template __forceinline Vec3 operator <<( const Vec3& a, const int b ) { return Vec3(a.x << b, 
a.y << b, a.z << b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 madd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); } + template __forceinline Vec3 msub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); } + template __forceinline Vec3 nmadd ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));} + template __forceinline Vec3 nmsub ( const Vec3& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); } + + template __forceinline Vec3 madd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); } + template __forceinline Vec3 msub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); } + template __forceinline Vec3 nmadd ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));} + template __forceinline Vec3 nmsub ( const T& a, const Vec3& b, const Vec3& c) { return Vec3(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3& operator +=( Vec3& a, const T b ) { a.x += b; a.y += b; a.z += b; return a; } + template __forceinline Vec3& operator +=( Vec3& a, const Vec3& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } + template __forceinline Vec3& operator -=( Vec3& a, const Vec3& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } + template 
__forceinline Vec3& operator *=( Vec3& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; return a; } + template __forceinline Vec3& operator /=( Vec3& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec3& a ) { return a.x + a.y + a.z; } + template __forceinline T reduce_mul( const Vec3& a ) { return a.x * a.y * a.z; } + template __forceinline T reduce_min( const Vec3& a ) { return min(a.x, a.y, a.z); } + template __forceinline T reduce_max( const Vec3& a ) { return max(a.x, a.y, a.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec3& a, const Vec3& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; } + template __forceinline bool operator !=( const Vec3& a, const Vec3& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; } + template __forceinline bool operator < ( const Vec3& a, const Vec3& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 shift_right_1( const Vec3& a ) { + return Vec3(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 select ( bool s, const Vec3& t, const Vec3& f ) { + 
return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template __forceinline Vec3 select ( const Vec3& s, const Vec3& t, const Vec3& f ) { + return Vec3(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z)); + } + + template __forceinline Vec3 select ( const typename T::Bool& s, const Vec3& t, const Vec3& f ) { + return Vec3(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z)); + } + + template + __forceinline Vec3 lerp(const Vec3& v0, const Vec3& v1, const T& t) { + return madd(Vec3(T(1.0f)-t),v0,t*v1); + } + + template __forceinline int maxDim ( const Vec3& a ) + { + const Vec3 b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec3 eq_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x==b.x,a.y==b.y,a.z==b.z); } + template __forceinline Vec3 neq_mask(const Vec3& a, const Vec3& b ) { return Vec3(a.x!=b.x,a.y!=b.y,a.z!=b.z); } + template __forceinline Vec3 lt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x< b.x,a.y< b.y,a.z< b.z); } + template __forceinline Vec3 le_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x<=b.x,a.y<=b.y,a.z<=b.z); } + template __forceinline Vec3 gt_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x> b.x,a.y> b.y,a.z> b.z); } + template __forceinline Vec3 ge_mask( const Vec3& a, const Vec3& b ) { return Vec3(a.x>=b.x,a.y>=b.y,a.z>=b.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T sqr ( const Vec3& a ) { return dot(a,a); } + template __forceinline T dot ( const Vec3& a, const Vec3& b ) { return 
madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); } + template __forceinline T length ( const Vec3& a ) { return sqrt(sqr(a)); } + template __forceinline T rcp_length( const Vec3& a ) { return rsqrt(sqr(a)); } + template __forceinline Vec3 normalize( const Vec3& a ) { return a*rsqrt(sqr(a)); } + template __forceinline T distance ( const Vec3& a, const Vec3& b ) { return length(a-b); } + template __forceinline Vec3 cross ( const Vec3& a, const Vec3& b ) { return Vec3(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); } + template __forceinline Vec3 stable_triangle_normal( const Vec3& a, const Vec3& b, const Vec3& c ) + { + const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; + const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x; + const Vec3 cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z)); + const Vec3 cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z)); + const auto sx = abs(ab_x) < abs(bc_x); + const auto sy = abs(ab_y) < abs(bc_y); + const auto sz = abs(ab_z) < abs(bc_z); + return Vec3(select(sx,cross_ab.x,cross_bc.x), + select(sy,cross_ab.y,cross_bc.y), + select(sz,cross_ab.z,cross_bc.z)); + } + + template __forceinline T sum ( const Vec3& a ) { return a.x+a.y+a.z; } + + template __forceinline T halfArea ( const Vec3& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + template __forceinline T area ( const Vec3& d ) { return 2.0f*halfArea(d); } + + template __forceinline Vec3 normalize_safe( const Vec3& a ) { + const T d = dot(a,a); return select(d == T( zero ), a , a*rsqrt(d) ); + } + + template __forceinline T sqr_point_to_line_distance(const Vec3& P, const Vec3& Q0, const Vec3& Q1) + { + const Vec3 N = cross(P-Q0,Q1-Q0); + const Vec3 D = Q1-Q0; + return dot(N,N)*rcp(dot(D,D)); + } + + template __forceinline T sqr_point_to_line_distance(const Vec3& PmQ0, const Vec3& Q1mQ0) + { + const Vec3 N = cross(PmQ0,Q1mQ0); + const Vec3 D = Q1mQ0; + return dot(N,N)*rcp(dot(D,D)); + } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3 Vec3b; + typedef Vec3 Vec3i; + typedef Vec3 Vec3f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined(__AVX__) +#include "../simd/avx.h" +#endif + +#if defined(__AVX512F__) +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(Out(a.x[k]), Out(a.y[k]), Out(a.z[k])); + } + + template<> __forceinline Vec3::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } + +#if defined(__AVX__) + template<> __forceinline Vec3::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } +#elif defined(__SSE__) || defined(__ARM_NEON) + template<> + __forceinline Vec3::Vec3(const Vec3fa& a) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); + } +#endif + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + + template + __forceinline Vec3 shuffle(const Vec3& b) { + return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); 
+ } +#endif + +#if defined(__AVX__) + template<> + __forceinline Vec3::Vec3(const Vec3fa& a) { + x = a.x; y = a.y; z = a.z; + } + __forceinline Vec3 broadcast4f(const Vec3& a, const size_t k) { + return Vec3(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); + } + __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + __forceinline Vec3 broadcast8f(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + template<> + __forceinline Vec3 broadcast(const Vec3& a, const size_t k) { + return Vec3(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k])); + } + + template + __forceinline Vec3 shuffle(const Vec3& b) { + return Vec3(shuffle(b.x), shuffle(b.y), shuffle(b.z)); + } +#endif + +#if defined(__AVX512F__) + template<> __forceinline Vec3::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h new file mode 100644 index 00000000000..90f31739c2d --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3ba.h @@ -0,0 +1,120 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ba Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ba + { + ALIGNED_STRUCT_(16); + + union { + __m128 m128; + struct { int x,y,z; 
}; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( ) {} + __forceinline Vec3ba( const __m128 input ) : m128(input) {} + __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {} + __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ba( bool a ) + : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline Vec3ba( bool a, bool b, bool c) + : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} + + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); } + __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); } + __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; } + __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; } + __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; + } + __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { + return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; + } + __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool 
reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; } + __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; } + + __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; } + __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; } + __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; } + + __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) { + return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h new file mode 100644 index 00000000000..6163cfb5966 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3fa.h @@ -0,0 +1,810 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fa Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fa + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ) {} + __forceinline Vec3fa( const __m128 
a ) : m128(a) {} + + __forceinline Vec3fa ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fa& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } + __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else + return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif + } + + static __forceinline Vec3fa loadu( const void* const a ) { + return Vec3fa(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } + __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + + return _mm_xor_ps(a.m128, mask); +#endif + } + __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); +#endif + } + __forceinline Vec3fa sign ( const Vec3fa& a ) { +#if defined(__aarch64__) + Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); + return r; +#else + return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); +#endif + } + + __forceinline Vec3fa rcp ( const Vec3fa& a ) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#elif defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = 
vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec3fa)reciprocal; +#else + +#if defined(__AVX512VL__) + const Vec3fa r = _mm_rcp14_ps(a.m128); +#else + const Vec3fa r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; +#endif //defined(__aarch64__) + } + + __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fa rsqrt( const Vec3fa& a ) + { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = _mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif + } + + __forceinline Vec3fa zero_fix(const Vec3fa& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fa rcp_safe(const Vec3fa& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fa log ( const Vec3fa& a ) { + return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fa exp ( const Vec3fa& a ) { + return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa 
operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } + __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } + __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { + return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fa madd ( const Vec3fa& a, 
const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + +#if defined(__aarch64__) + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128, b.m128, c.m128); //a*b+c; + } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_msub_ps(a.m128, b.m128, c.m128); //-a*b+c; + } + __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128); + return -t; + } + __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { + return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c + } + +#else + __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } + __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} + __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } +#endif + +#endif + + __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } + __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } + __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } + __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } + __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } + __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } + __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif + + __forceinline bool isvalid ( const Vec3fa& v ) { + return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fa& a ) { + return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fa& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fa& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + 
//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1))); + } + + __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fa& a ) + { + const Vec3fa b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) + __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { + return 
cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + typedef Vec3fa Vec3fa_t; + + + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3fx Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3fx + { + ALIGNED_STRUCT_(16); + + typedef float Scalar; + enum { N = 3 }; + union { + __m128 m128; + struct { float x,y,z; union { int a; unsigned u; float w; }; }; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ) {} + __forceinline Vec3fx( const __m128 a ) : m128(a) {} + + __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} + __forceinline operator Vec3fa () const { return Vec3fa(m128); } + + __forceinline explicit Vec3fx ( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } + //__forceinline Vec3fx& operator =( const Vec3& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } + + __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } + + __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} + __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} + + __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } + __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) + m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); +#else + const vint4 mask(-1,-1,-1,0); + m128 = 
select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); +#endif + } + //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! + //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! + __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} + + //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} + + __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } + __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } + __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } + __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } + + //__forceinline operator const __m128&() const { return m128; } + //__forceinline operator __m128&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline Vec3fx load( const void* const a ) { + return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); + } + + static __forceinline Vec3fx loadu( const void* const a ) { + return Vec3fx(_mm_loadu_ps((float*)a)); + } + + static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { + _mm_storeu_ps((float*)ptr,v.m128); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} + __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} + __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} + 
__forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } + __forceinline Vec3fx operator -( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return _mm_xor_ps(a.m128, mask); + } + __forceinline Vec3fx abs ( const Vec3fx& a ) { + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return _mm_and_ps(a.m128, mask); + } + __forceinline Vec3fx sign ( const Vec3fx& a ) { + return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); + } + + __forceinline Vec3fx rcp ( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + const Vec3fx r = _mm_rcp14_ps(a.m128); +#else + const Vec3fx r = _mm_rcp_ps(a.m128); +#endif + +#if defined(__AVX2__) + const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); +#else + const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); + //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); +#endif + + return res; + } + + __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } + __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } + + __forceinline Vec3fx rsqrt( const Vec3fx& a ) + { +#if defined(__AVX512VL__) + __m128 r = _mm_rsqrt14_ps(a.m128); +#else + __m128 r = 
_mm_rsqrt_ps(a.m128); +#endif + return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + } + + __forceinline Vec3fx zero_fix(const Vec3fx& a) { + return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); + } + __forceinline Vec3fx rcp_safe(const Vec3fx& a) { + return rcp(zero_fix(a)); + } + __forceinline Vec3fx log ( const Vec3fx& a ) { + return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); + } + + __forceinline Vec3fx exp ( const Vec3fx& a ) { + return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } + __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } + __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } + __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } + __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } + __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } + __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } + + __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } + __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } + +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { + const 
vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { + const vint4 ai = _mm_castps_si128(a.m128); + const vint4 bi = _mm_castps_si128(b.m128); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } +#endif + + __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { + return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } +#else + __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } + __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } + __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} + __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } +#endif + + __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } + __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } + __forceinline Vec3fx nmadd ( const float a, 
const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } + __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } + __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } + __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } + __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float reduce_add(const Vec3fx& v) { + const vfloat4 a(v.m128); + const vfloat4 b = shuffle<1>(a); + const vfloat4 c = shuffle<2>(a); + return _mm_cvtss_f32(a+b+c); + } + + __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } + __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } + __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } + + __forceinline 
Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + + __forceinline bool isvalid ( const Vec3fx& v ) { + return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); + } + + __forceinline bool is_finite ( const Vec3fx& a ) { + return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); + } + + __forceinline bool isvalid4 ( const Vec3fx& v ) { + return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite4 ( const Vec3fx& a ) { + return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE4_1__) + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + } +#else + __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { + return reduce_add(a*b); + } +#endif + + __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) + { + vfloat4 a0 = vfloat4(a.m128); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); + vfloat4 b1 = vfloat4(b.m128); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + } + + __forceinline float 
sqr_length ( const Vec3fx& a ) { return dot(a,a); } + __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } + __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } + __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } + __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } + __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } + __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } + __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } + + __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { + const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + } + + /*! differentiated normalization */ + __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) + { + const float pp = dot(p,p); + const float pdp = dot(p,dp); + return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { + __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); + return blendv_ps(f.m128, t.m128, mask); + } + + __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { + return blendv_ps(f.m128, t.m128, s); + } + + __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { + return madd(1.0f-t,v0,t*v1); + } + + __forceinline int maxDim ( const Vec3fx& a ) + { + const Vec3fx b = abs(a); + if (b.x > b.y) { + if (b.x > b.z) return 0; else return 2; + } else { + if (b.y > b.z) return 1; else return 2; + } + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if defined (__SSE4_1__) && !defined(__aarch64__) + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } +#else + __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } + + + typedef Vec3fx Vec3ff; +} diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h new file mode 100644 index 
00000000000..737f67fd725 --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec3ia.h @@ -0,0 +1,210 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/alloc.h" +#include "math.h" +#include "../simd/sse.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// SSE Vec3ia Type + //////////////////////////////////////////////////////////////////////////////// + + struct __aligned(16) Vec3ia + { + ALIGNED_STRUCT_(16); + + union { + __m128i m128; + struct { int x,y,z; }; + }; + + typedef int Scalar; + enum { N = 3 }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ) {} + __forceinline Vec3ia( const __m128i a ) : m128(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + + __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} + __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + + __forceinline operator const __m128i&() const { return m128; } + __forceinline operator __m128i&() { return m128; } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + + 
//////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } + __forceinline int& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } +#if (defined(__aarch64__)) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } +#elif defined(__SSSE3__) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } + __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } + + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } + __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { 
return a * Vec3ia(b); } + __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } +#endif + + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } + __forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } + + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } + __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } + + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } + __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } + +#if !defined(__ARM_NEON) + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; } + __forceinline Vec3ia& operator +=( Vec3ia& a, const int& b ) { return a = a + b; } + + __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { 
return a = a - b; } + __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } + __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } +#endif + + __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; } + __forceinline Vec3ia& operator &=( Vec3ia& a, const int& b ) { return a = a & b; } + + __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } + __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } + +#if !defined(__ARM_NEON) + __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } + __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline int reduce_add(const Vec3ia& v) { + int32x4_t t = v.m128; + t[3] = 0; + return vaddvq_s32(t); + + } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0); + return vminvq_s32(t); + + } + __forceinline int reduce_max(const Vec3ia& v) { + int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0); + return vmaxvq_s32(t); + + } +#else + __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } + __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } +#endif + 
//////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + return false; + } + + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#else + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } + 
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; + } +} diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h new file mode 100644 index 00000000000..d16542f507f --- /dev/null +++ b/thirdparty/embree-aarch64/common/math/vec4.h @@ -0,0 +1,258 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "math.h" +#include "vec3.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Generic 4D vector Class + //////////////////////////////////////////////////////////////////////////////// + + template struct Vec4 + { + enum { N = 4 }; + union { + struct { T x, y, z, w; }; +#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler + T components[N]; +#endif + }; + + typedef T Scalar; + + //////////////////////////////////////////////////////////////////////////////// + /// Construction + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ) {} + __forceinline explicit Vec4( const T& a ) : x(a), y(a), z(a), w(a) {} + __forceinline Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {} + __forceinline Vec4( const Vec3& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} + + __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; } + __forceinline Vec4( const Vec3fx& other ); + + template __forceinline Vec4( const Vec4& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {} 
+ template __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; } + + __forceinline operator Vec3 () const { return Vec3(x,y,z); } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec4( ZeroTy ) : x(zero), y(zero), z(zero), w(zero) {} + __forceinline Vec4( OneTy ) : x(one), y(one), z(one), w(one) {} + __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {} + __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {} + +#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler + __forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return (&x)[axis]; } +#else + __forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; } + __forceinline T& operator [](const size_t axis) { assert(axis < 4); return components[axis]; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Swizzles + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Vec3 xyz() const { return Vec3(x, y, z); } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 operator +( const Vec4& a ) { return Vec4(+a.x, +a.y, +a.z, +a.w); } + template __forceinline Vec4 operator -( const Vec4& a ) { return Vec4(-a.x, -a.y, -a.z, -a.w); } + template __forceinline Vec4 abs ( const 
Vec4& a ) { return Vec4(abs (a.x), abs (a.y), abs (a.z), abs (a.w)); } + template __forceinline Vec4 rcp ( const Vec4& a ) { return Vec4(rcp (a.x), rcp (a.y), rcp (a.z), rcp (a.w)); } + template __forceinline Vec4 rsqrt ( const Vec4& a ) { return Vec4(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); } + template __forceinline Vec4 sqrt ( const Vec4& a ) { return Vec4(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 operator +( const Vec4& a, const Vec4& b ) { return Vec4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + template __forceinline Vec4 operator -( const Vec4& a, const Vec4& b ) { return Vec4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + template __forceinline Vec4 operator *( const Vec4& a, const Vec4& b ) { return Vec4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + template __forceinline Vec4 operator *( const T& a, const Vec4& b ) { return Vec4(a * b.x, a * b.y, a * b.z, a * b.w); } + template __forceinline Vec4 operator *( const Vec4& a, const T& b ) { return Vec4(a.x * b , a.y * b , a.z * b , a.w * b ); } + template __forceinline Vec4 operator /( const Vec4& a, const Vec4& b ) { return Vec4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } + template __forceinline Vec4 operator /( const Vec4& a, const T& b ) { return Vec4(a.x / b , a.y / b , a.z / b , a.w / b ); } + template __forceinline Vec4 operator /( const T& a, const Vec4& b ) { return Vec4(a / b.x, a / b.y, a / b.z, a / b.w); } + + template __forceinline Vec4 min(const Vec4& a, const Vec4& b) { return Vec4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); } + template __forceinline Vec4 max(const Vec4& a, const Vec4& b) { return Vec4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 madd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); } + template __forceinline Vec4 msub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); } + template __forceinline Vec4 nmadd ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); } + template __forceinline Vec4 nmsub ( const Vec4& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); } + + template __forceinline Vec4 madd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); } + template __forceinline Vec4 msub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); } + template __forceinline Vec4 nmadd ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); } + template __forceinline Vec4 nmsub ( const T& a, const Vec4& b, const Vec4& c) { return Vec4(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4& operator +=( Vec4& a, const Vec4& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } + template __forceinline Vec4& operator -=( Vec4& a, const Vec4& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w 
-= b.w; return a; } + template __forceinline Vec4& operator *=( Vec4& a, const T& b ) { a.x *= b ; a.y *= b ; a.z *= b ; a.w *= b ; return a; } + template __forceinline Vec4& operator /=( Vec4& a, const T& b ) { a.x /= b ; a.y /= b ; a.z /= b ; a.w /= b ; return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T reduce_add( const Vec4& a ) { return a.x + a.y + a.z + a.w; } + template __forceinline T reduce_mul( const Vec4& a ) { return a.x * a.y * a.z * a.w; } + template __forceinline T reduce_min( const Vec4& a ) { return min(a.x, a.y, a.z, a.w); } + template __forceinline T reduce_max( const Vec4& a ) { return max(a.x, a.y, a.z, a.w); } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline bool operator ==( const Vec4& a, const Vec4& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } + template __forceinline bool operator !=( const Vec4& a, const Vec4& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; } + template __forceinline bool operator < ( const Vec4& a, const Vec4& b ) { + if (a.x != b.x) return a.x < b.x; + if (a.y != b.y) return a.y < b.y; + if (a.z != b.z) return a.z < b.z; + if (a.w != b.w) return a.w < b.w; + return false; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Shift Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 shift_right_1( const Vec4& a ) { + return Vec4(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space 
Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline T dot ( const Vec4& a, const Vec4& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } + + template __forceinline T length ( const Vec4& a ) { return sqrt(dot(a,a)); } + template __forceinline Vec4 normalize( const Vec4& a ) { return a*rsqrt(dot(a,a)); } + template __forceinline T distance ( const Vec4& a, const Vec4& b ) { return length(a-b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline Vec4 select ( bool s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template __forceinline Vec4 select ( const Vec4& s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w)); + } + + template __forceinline Vec4 select ( const typename T::Bool& s, const Vec4& t, const Vec4& f ) { + return Vec4(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w)); + } + + template + __forceinline Vec4 lerp(const Vec4& v0, const Vec4& v1, const T& t) { + return madd(Vec4(T(1.0f)-t),v0,t*v1); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + template __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4& a) { + return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")"; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Default template instantiations + //////////////////////////////////////////////////////////////////////////////// + + typedef Vec4 Vec4b; + typedef Vec4 Vec4uc; + typedef Vec4 Vec4i; + typedef 
Vec4 Vec4f; +} + +#include "vec3ba.h" +#include "vec3ia.h" +#include "vec3fa.h" + +//////////////////////////////////////////////////////////////////////////////// +/// SSE / AVX / MIC specializations +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__SSE__) || defined(__ARM_NEON) +#include "../simd/sse.h" +#endif + +#if defined __AVX__ +#include "../simd/avx.h" +#endif + +#if defined __AVX512F__ +#include "../simd/avx512.h" +#endif + +namespace embree +{ + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } + +#if defined(__AVX__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } +#elif defined(__SSE__) || defined(__ARM_NEON) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + } +#endif + +#if defined(__SSE__) || defined(__ARM_NEON) + __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { + return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); + } +#endif + +#if defined(__AVX__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { + x = a.x; y = a.y; z = a.z; w = a.w; + } + __forceinline Vec4 broadcast4f( const Vec4& a, const size_t k ) { + return Vec4(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k])); + } + __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { + return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); + } + __forceinline Vec4 broadcast8f( const Vec4& a, const size_t k ) { + return Vec4(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k])); + } +#endif + +#if 
defined(__AVX512F__) + template<> __forceinline Vec4::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {} +#endif +} diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h new file mode 100644 index 00000000000..c840e418058 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/avx.h @@ -0,0 +1,34 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "sse.h" + +#if defined(__AVX512VL__) +#include "vboolf8_avx512.h" +#include "vboold4_avx512.h" +#else +#include "vboolf8_avx.h" +#include "vboold4_avx.h" +#endif + +#if defined(__AVX2__) +#include "vint8_avx2.h" +#include "vuint8_avx2.h" +#if defined(__X86_64__) +#include "vllong4_avx2.h" +#endif +#else +#include "vint8_avx.h" +#include "vuint8_avx.h" +#endif +#include "vfloat8_avx.h" +#if defined(__X86_64__) +#include "vdouble4_avx.h" +#endif + +#if defined(__AVX512F__) +#include "avx512.h" +#endif + diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h new file mode 100644 index 00000000000..25414ab5b1d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/avx512.h @@ -0,0 +1,41 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../math/constants.h" +#include "../sys/alloc.h" +#include "varying.h" + +#include "vboolf16_avx512.h" +#include "vint16_avx512.h" +#include "vuint16_avx512.h" +#include "vfloat16_avx512.h" + +#include "vboold8_avx512.h" +#include "vllong8_avx512.h" +#include "vdouble8_avx512.h" + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Prefetching + //////////////////////////////////////////////////////////////////////////////// + +#define PFHINT_L1 0 +#define PFHINT_L2 1 +#define PFHINT_NT 2 + + template + __forceinline void prefetch(const 
void * __restrict__ const m) + { + if (mode == PFHINT_L1) + _mm_prefetch((const char*)m,_MM_HINT_T0); + else if (mode == PFHINT_L2) + _mm_prefetch((const char*)m,_MM_HINT_T1); + else if (mode == PFHINT_NT) + _mm_prefetch((const char*)m,_MM_HINT_NTA); + } +} diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h new file mode 100644 index 00000000000..647851110b8 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/simd.h @@ -0,0 +1,110 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +/* include SSE wrapper classes */ +#if defined(__SSE__) || defined(__ARM_NEON) +# include "sse.h" +#endif + +/* include AVX wrapper classes */ +#if defined(__AVX__) +# include "avx.h" +#endif + +/* include AVX512 wrapper classes */ +#if defined (__AVX512F__) +# include "avx512.h" +#endif + +namespace embree +{ + template + __forceinline vbool isfinite(const vfloat& v) + { + return (v >= vfloat(-std::numeric_limits::max())) + & (v <= vfloat( std::numeric_limits::max())); + } + + /* foreach unique */ + template + __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i); + } + } + + /* returns the next unique value i in vi and the corresponding valid_i mask */ + template + __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return i; + } + + /* foreach unique index */ + template + __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure) + { + vbool valid1 = valid0; + while 
(any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int i = vi[j]; + const vbool valid2 = valid1 & (i == vi); + valid1 = andn(valid1, valid2); + closure(valid2, i, j); + } + } + + /* returns the index of the next unique value i in vi and the corresponding valid_i mask */ + template + __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i) + { + assert(any(valid)); + const int j = int(bsf(movemask(valid))); + const int i = vi[j]; + valid_i = valid & (i == vi); + valid = andn(valid, valid_i); + return j; + } + + template + __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure) + { + __aligned(64) int U[2*VSIZEX]; + __aligned(64) int V[2*VSIZEX]; + int index = 0; + for (int y=y0; y=y1; + const vintx vy = y; + for (int x=x0; x= x1; + vintx vx = x+vintx(step); + vintx::storeu(&U[index], vx); + vintx::storeu(&V[index], vy); + const int dx = min(x1-x,VSIZEX); + index += dx; + x += dx; + if (index >= VSIZEX || (lastx && lasty)) { + const vboolx valid = vintx(step) < vintx(index); + closure(valid, vintx::load(U), vintx::load(V)); + x-= max(0, index-VSIZEX); + index = 0; + } + } + } + } +} diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp new file mode 100644 index 00000000000..1732cfa4215 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/sse.cpp @@ -0,0 +1,34 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sse.h" + +namespace embree +{ + const __m128 mm_lookupmask_ps[16] = { + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), + 
_mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), + _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) + }; + + const __m128d mm_lookupmask_pd[4] = { + _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), + _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) + }; + +} diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h new file mode 100644 index 00000000000..6bc818b55bb --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/sse.h @@ -0,0 +1,35 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/intrinsics.h" +#include "../sys/alloc.h" +#include "../math/constants.h" +#include "varying.h" + +namespace embree +{ +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_blendv_ps(f,t,mask); + } +#else + __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { + return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); + } +#endif + + extern const __m128 mm_lookupmask_ps[16]; + extern const __m128d mm_lookupmask_pd[4]; +} + +#if defined(__AVX512VL__) +#include "vboolf4_avx512.h" +#else +#include "vboolf4_sse2.h" +#endif +#include "vint4_sse2.h" +#include "vuint4_sse2.h" +#include "vfloat4_sse2.h" diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h new file mode 100644 index 00000000000..9a46817da91 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/varying.h @@ -0,0 +1,132 @@ +// Copyright 
2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" + +namespace embree +{ + /* Varying numeric types */ + template + struct vfloat + { + union { float f[N]; int i[N]; }; + __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template + struct vdouble + { + union { double f[N]; long long i[N]; }; + __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; } + __forceinline double& operator [](size_t index) { assert(index < N); return f[index]; } + }; + + template + struct vint + { + int i[N]; + __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template + struct vuint + { + unsigned int i[N]; + __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + template + struct vllong + { + long long i[N]; + __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; } + __forceinline long long& operator [](size_t index) { assert(index < N); return i[index]; } + }; + + /* Varying bool types */ + template struct vboolf { int i[N]; }; // for float/int + template struct vboold { long long i[N]; }; // for double/long long + + /* Aliases to default types */ + template using vreal = vfloat; + template using vbool = vboolf; + + /* Varying size constants */ +#if defined(__AVX512VL__) // SKX + const int VSIZEX = 8; // default size + const int VSIZEL = 16; // large size +#elif defined(__AVX512F__) // KNL + const int VSIZEX = 16; + const int VSIZEL = 16; +#elif defined(__AVX__) + const int VSIZEX = 8; + const 
int VSIZEL = 8; +#else + const int VSIZEX = 4; + const int VSIZEL = 4; +#endif + + /* Extends varying size N to optimal or up to max(N, N2) */ + template + struct vextend + { +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */ + static const int size = (N2 == VSIZEX) ? VSIZEX : N; + #define SIMD_MODE(N) N, 16 +#else + /* calculate with same SIMD width otherwise */ + static const int size = N; + #define SIMD_MODE(N) N, N +#endif + }; + + /* 4-wide shortcuts */ + typedef vfloat<4> vfloat4; + typedef vdouble<4> vdouble4; + typedef vreal<4> vreal4; + typedef vint<4> vint4; + typedef vuint<4> vuint4; + typedef vllong<4> vllong4; + typedef vbool<4> vbool4; + typedef vboolf<4> vboolf4; + typedef vboold<4> vboold4; + + /* 8-wide shortcuts */ + typedef vfloat<8> vfloat8; + typedef vdouble<8> vdouble8; + typedef vreal<8> vreal8; + typedef vint<8> vint8; + typedef vuint<8> vuint8; + typedef vllong<8> vllong8; + typedef vbool<8> vbool8; + typedef vboolf<8> vboolf8; + typedef vboold<8> vboold8; + + /* 16-wide shortcuts */ + typedef vfloat<16> vfloat16; + typedef vdouble<16> vdouble16; + typedef vreal<16> vreal16; + typedef vint<16> vint16; + typedef vuint<16> vuint16; + typedef vllong<16> vllong16; + typedef vbool<16> vbool16; + typedef vboolf<16> vboolf16; + typedef vboold<16> vboold16; + + /* Default shortcuts */ + typedef vfloat vfloatx; + typedef vdouble vdoublex; + typedef vreal vrealx; + typedef vint vintx; + typedef vuint vuintx; + typedef vllong vllongx; + typedef vbool vboolx; + typedef vboolf vboolfx; + typedef vboold vbooldx; +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h new file mode 100644 index 00000000000..6505ee56f38 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h @@ -0,0 +1,160 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + 
+namespace embree +{ + /* 4-wide AVX bool type for 64bit data types*/ + template<> + struct vboold<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + struct { __m128d vl,vh; }; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& a) { v = a.v; } + __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; } + + __forceinline vboold(__m256d a) : v(a) {} + __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} + + __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } + __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } + __forceinline operator const __m256d() const { return v; } + + __forceinline vboold(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi64x(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask)); +#else + vl = mm_lookupmask_pd[a & 0x3]; + vh = mm_lookupmask_pd[a >> 2]; +#endif + } + + __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} +#if !defined(__aarch64__) + __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} +#else + __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} +#endif + + 
//////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return 
_mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + + __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { + return _mm256_blendv_pd(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + +#if !defined(__aarch64__) + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } +#endif + +#if defined(__AVX2__) + template + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vboold4 shuffle(const vboold4& v) { + return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + + __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } + __forceinline size_t 
popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { return a[index]; } + __forceinline void set (vboold4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboold4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h new file mode 100644 index 00000000000..4fe730d713d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h @@ -0,0 +1,140 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboold<4> + { + typedef vboold4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold4& t) { v = t.v; } + __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + 
__forceinline vboold(bool b) { v = b ? 0xf : 0x0; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x0) {} + __forceinline vboold(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } + __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } + __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold4& a) { return a.v == 0xf; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } + __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); } + __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboold4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h new file mode 100644 index 00000000000..fdf3f00de54 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h @@ -0,0 +1,148 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboold<8> + { + typedef vboold8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold() {} + __forceinline vboold(const vboold8& t) { v = t.v; } + __forceinline 
vboold8& operator =(const vboold8& f) { v = f.v; return *this; } + + __forceinline vboold(const __mmask8& t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboold(int t) { v = (__mmask8)t; } + __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { +#if defined(__AVX512BW__) + return _mm_movm_epi8(v); +#else + const __m512i f = _mm512_set1_epi64(0); + const __m512i t = _mm512_set1_epi64(-1); + const __m512i m = _mm512_mask_or_epi64(f,v,t,t); + return _mm512_cvtepi64_epi8(m); +#endif + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { +#if defined(__AVX512DQ__) + return _mm512_movm_epi64(v); +#else + const __m512i f = _mm512_set1_epi64(0); + const __m512i t = _mm512_set1_epi64(-1); + return _mm512_mask_or_epi64(f,v,t,t); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold(FalseTy) : v(0x00) {} + __forceinline vboold(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline 
vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); } + __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); } + __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + + __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; } + __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; } + __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); } + __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboold8& a) { return a.v == 0xff; } + __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); } + __forceinline int any (const vboold8& valid, const vboold8& 
b) { return any(valid & b); } + __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboold8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboold8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h new file mode 100644 index 00000000000..238cdc8eb92 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h @@ -0,0 +1,150 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 bool type */ + template<> + struct vboolf<16> + { + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 
Float; + + enum { size = 16 }; // number of SIMD elements + __mmask16 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf16& t) { v = t.v; } + __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask16& t) { v = t; } + __forceinline operator __mmask16() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } + __forceinline vboolf(int t) { v = (__mmask16)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } + + /* return int8 mask */ + __forceinline __m128i mask8() const { +#if defined(__AVX512BW__) + return _mm_movm_epi8(v); +#else + const __m512i f = _mm512_set1_epi32(0); + const __m512i t = _mm512_set1_epi32(-1); + const __m512i m = _mm512_mask_or_epi32(f,v,t,t); + return _mm512_cvtepi32_epi8(m); +#endif + } + + /* return int32 mask */ + __forceinline __m512i mask32() const { +#if defined(__AVX512DQ__) + return _mm512_movm_epi32(v); +#else + const __m512i f = _mm512_set1_epi32(0); + const __m512i t = _mm512_set1_epi32(-1); + return _mm512_mask_or_epi32(f,v,t,t); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0000) {} + __forceinline vboolf(TrueTy) : v(0xffff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 16); return (mm512_mask2int(v) >> index) & 1; + } + }; + + 
//////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); } + __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); } + __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); } + + __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; } + __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; } + __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); } + __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) { + return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b)); + } + + //////////////////////////////////////////////////////////////////////////////// 
+ /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; } + __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; } + __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; } + + __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); } + __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); } + __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Convertion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } + __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf16& a, size_t index) { assert(index < 16); a |= 1 << index; } + __forceinline void clear(vboolf16& a, size_t index) { assert(index < 16); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a) + { + cout << "<"; + for (size_t i=0; i<16; i++) { + if 
((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h new file mode 100644 index 00000000000..2ae4c4470e2 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h @@ -0,0 +1,143 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX-512 bool type */ + template<> + struct vboolf<4> + { + typedef vboolf4 Bool; + typedef vint4 Int; + + enum { size = 4 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& t) { v = t.v; } + __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 
0xf : 0x0; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m256i mask64() const { + return _mm256_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x0) {} + __forceinline vboolf(TrueTy) : v(0xf) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 4); return (mm512_mask2int(v) >> index) & 1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return 
_mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + + __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf4& a) { return a.v == 0xf; } + __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf4& a, size_t index) { assert(index < 4); a |= 1 << index; } + __forceinline void clear(vboolf4& a, size_t index) { assert(index < 4); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) + { + cout << "<"; + for (size_t i=0; i<4; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h new file mode 100644 index 00000000000..ed53b3c783f --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h @@ -0,0 +1,198 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide SSE bool type */ + template<> + struct vboolf<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf4& other) { v = other.v; } + __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; } + + __forceinline vboolf(__m128 input) : v(input) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator const __m128i() const { return _mm_castps_si128(v); } + __forceinline operator const __m128d() const { return _mm_castps_pd(v); } + + __forceinline vboolf(bool a) + : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b) + : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) + : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} +#if defined(__aarch64__) && defined(BUILD_IOS) + __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } +#else + __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } + __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } +#endif + /* return int32 mask */ + __forceinline __m128i mask32() const { + return _mm_castps_si128(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {} + __forceinline vboolf(TrueTy) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + +#if 
defined(__aarch64__) && defined(BUILD_IOS) + __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { return i[index]; } +#else + __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } +#endif + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); } + __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); } + __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + + __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; } + __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; } + __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } + __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + + __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } + +#if defined(__aarch64__) + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); + } + + template + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } +#endif + + template + __forceinline vboolf4 shuffle(const vboolf4& v) { + return shuffle(v); + } + +#if defined(__SSE3__) + template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vboolf4 shuffle<0, 1, 
0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } +#endif + +#if defined(__SSE4_1__) && !defined(__aarch64__) + template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert(a, b); } + template __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert(a, vboolf4(b)); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } + __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } + + __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } + __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } + __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } + + __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } + __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } +#if defined(__aarch64__) && defined(BUILD_IOS) +__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } +#else +#if defined(__SSE4_2__) + __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } +#else + __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } + __forceinline void set(vboolf4& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf4& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h new file mode 100644 index 00000000000..4f64741b55a --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h @@ -0,0 +1,189 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX bool type */ + template<> + struct vboolf<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256 v; + struct { __m128 vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& a) { v = a.v; } + __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; } + + __forceinline vboolf(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } + __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } + + 
__forceinline vboolf(int a) + { + assert(a >= 0 && a <= 255); +#if defined (__AVX2__) + const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const __m256i b = _mm256_set1_epi32(a); + const __m256i c = _mm256_and_si256(b,mask); + v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); +#else + vl = mm_lookupmask_ps[a & 0xF]; + vh = mm_lookupmask_ps[a >> 4]; +#endif + } + + __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {} + + __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} + __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_castps_si256(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} +#if !defined(__aarch64__) + __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} +#else + __forceinline vboolf(TrueTy) : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } + __forceinline int& 
operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } + + __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) { + return _mm256_blendv_ps(f, t, mask); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } + + template + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vboolf8 shuffle4(const vboolf8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vboolf8 shuffle(const vboolf8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } + + template __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } + template __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool reduce_or 
(const vboolf8& a) { return !_mm256_testz_ps(a,a); } + + __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } + __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } + __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } + + __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } + __forceinline void set(vboolf8& a, size_t index) { a[index] = -1; } + __forceinline void clear(vboolf8& a, size_t index) { a[index] = 0; } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " + << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h new file mode 100644 index 00000000000..2a52b554c79 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h @@ -0,0 +1,143 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + 
+#pragma once + +namespace embree +{ + /* 8-wide AVX-512 bool type */ + template<> + struct vboolf<8> + { + typedef vboolf8 Bool; + typedef vint8 Int; + + enum { size = 8 }; // number of SIMD elements + __mmask8 v; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf() {} + __forceinline vboolf(const vboolf8& t) { v = t.v; } + __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; } + + __forceinline vboolf(const __mmask8 &t) { v = t; } + __forceinline operator __mmask8() const { return v; } + + __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } + __forceinline vboolf(int t) { v = (__mmask8)t; } + __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } + + __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) + : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} + + /* return int8 mask */ + __forceinline __m128i mask8() const { + return _mm_movm_epi8(v); + } + + /* return int32 mask */ + __forceinline __m256i mask32() const { + return _mm256_movm_epi32(v); + } + + /* return int64 mask */ + __forceinline __m512i mask64() const { + return _mm512_movm_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf(FalseTy) : v(0x00) {} + __forceinline vboolf(TrueTy) : v(0xff) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool operator [](size_t index) const { + assert(index < 8); return (mm512_mask2int(v) >> index) & 
1; + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); } + __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); } + __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + + __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; } + __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; } + __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } + __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } + + __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) { + return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Reduction Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int all (const vboolf8& a) { return a.v == 0xff; } + __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } + __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } + + __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } + __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } + __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } + + __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } + __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Conversion Operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Get/Set Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } + __forceinline void set(vboolf8& a, size_t index) { assert(index < 8); a |= 1 << index; } + __forceinline void clear(vboolf8& a, size_t index) { assert(index < 8); a = andn(a, 1 << index); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) + { + cout << "<"; + for (size_t i=0; i<8; i++) { + if ((a.v >> i) & 1) cout << "1"; else cout << "0"; + } + return cout << ">"; + } +} diff --git 
a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h new file mode 100644 index 00000000000..1f65b45d7e7 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h @@ -0,0 +1,324 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX 64-bit double type */ + template<> + struct vdouble<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256d v; + double i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble4& t) { v = t.v; } + __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m256d& t) { v = t; } + __forceinline operator __m256d() const { return v; } + + __forceinline vdouble(double i) { + v = _mm256_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm256_set_pd(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm256_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm256_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm256_set_pd(3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm256_setr_pd(3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) { + 
_mm256_stream_pd(ptr, a); + } + + static __forceinline vdouble4 loadu(const double* addr) { + return _mm256_loadu_pd(addr); + } + + static __forceinline vdouble4 load(const vdouble4* addr) { + return _mm256_load_pd((double*)addr); + } + + static __forceinline vdouble4 load(const double* addr) { + return _mm256_load_pd(addr); + } + + static __forceinline void store(double* ptr, const vdouble4& v) { + _mm256_store_pd(ptr, v); + } + + static __forceinline void storeu(double* ptr, const vdouble4& v) { + _mm256_storeu_pd(ptr, v); + } + + static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 4); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); } + __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); } +#endif + + __forceinline vdouble4 operator +(const vdouble4& a) { return a; } + __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); } + __forceinline vdouble4 operator +(const vdouble4& a, double b) { return a + vdouble4(b); } + 
__forceinline vdouble4 operator +(double a, const vdouble4& b) { return vdouble4(a) + b; } + + __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); } + __forceinline vdouble4 operator -(const vdouble4& a, double b) { return a - vdouble4(b); } + __forceinline vdouble4 operator -(double a, const vdouble4& b) { return vdouble4(a) - b; } + + __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); } + __forceinline vdouble4 operator *(const vdouble4& a, double b) { return a * vdouble4(b); } + __forceinline vdouble4 operator *(double a, const vdouble4& b) { return vdouble4(a) * b; } + + __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); } + __forceinline vdouble4 operator &(const vdouble4& a, double b) { return a & vdouble4(b); } + __forceinline vdouble4 operator &(double a, const vdouble4& b) { return vdouble4(a) & b; } + + __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); } + __forceinline vdouble4 operator |(const vdouble4& a, double b) { return a | vdouble4(b); } + __forceinline vdouble4 operator |(double a, const vdouble4& b) { return vdouble4(a) | b; } + + __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); } + __forceinline vdouble4 operator ^(const vdouble4& a, double b) { return a ^ vdouble4(b); } + __forceinline vdouble4 operator ^(double a, const vdouble4& b) { return vdouble4(a) ^ b; } + + __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); } + __forceinline vdouble4 min(const vdouble4& a, double b) { return min(a,vdouble4(b)); } + __forceinline vdouble4 min(double a, const vdouble4& b) { return min(vdouble4(a),b); } + + __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); } + __forceinline vdouble4 max(const vdouble4& a, double b) { 
return max(a,vdouble4(b)); } + __forceinline vdouble4 max(double a, const vdouble4& b) { return max(vdouble4(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__FMA__) + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); } + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); } +#else + __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; } + __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; } + __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;} + __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { return a = a + b; } + __forceinline vdouble4& operator +=(vdouble4& a, double b) { return a = a + b; } + + __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { return a = a - b; } + __forceinline vdouble4& operator -=(vdouble4& a, double b) { return a = a - b; } + + __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { return a = a * b; } + __forceinline vdouble4& operator *=(vdouble4& a, double b) { return a = a * b; } + + 
__forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { return a = a & b; } + __forceinline vdouble4& operator &=(vdouble4& a, double b) { return a = a & b; } + + __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { return a = a | b; } + __forceinline vdouble4& operator |=(vdouble4& a, double b) { return a = a | b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } +#elif !defined(__aarch64__) + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } + __forceinline vboold4 operator 
<=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); } +#endif + + __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } + __forceinline vboold4 operator ==(double a, const vdouble4& b) { return vdouble4(a) == b; } + + __forceinline vboold4 operator !=(const vdouble4& a, double b) { return a != vdouble4(b); } + __forceinline vboold4 operator !=(double a, const vdouble4& b) { return vdouble4(a) != b; } + + __forceinline vboold4 operator < (const vdouble4& a, double b) { return a < vdouble4(b); } + __forceinline vboold4 operator < (double a, const vdouble4& b) { return vdouble4(a) < b; } + + __forceinline vboold4 operator >=(const vdouble4& a, double b) { return a >= vdouble4(b); } + __forceinline vboold4 operator >=(double a, const vdouble4& b) { return vdouble4(a) >= b; } + + __forceinline vboold4 operator > (const vdouble4& a, double b) { return a > vdouble4(b); } + __forceinline vboold4 operator > (double a, const vdouble4& b) { return vdouble4(a) > b; } + + __forceinline vboold4 operator <=(const vdouble4& a, double b) { return a <= vdouble4(b); } + __forceinline vboold4 operator <=(double a, const vdouble4& b) { return vdouble4(a) <= b; } + + __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { return a == b; } + __forceinline vboold4 
ne(const vdouble4& a, const vdouble4& b) { return a != b; } + __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { return a < b; } + __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { return a >= b; } + __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { return a > b; } + __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= 
b); } +#endif + + __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { +#if defined(__AVX512VL__) + return _mm256_mask_blend_pd(m, f, t); +#else + return _mm256_blendv_pd(f, t, m); +#endif + } + + __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) { + const vdouble4 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { +#if defined(__AVX512VL__) + return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); +#else + return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vdouble4 shuffle(const vdouble4& v) { + return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template + __forceinline vdouble4 shuffle(const vdouble4& v) { + return shuffle(v); + } + + template + __forceinline vdouble4 shuffle2(const vdouble4& v) { + return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0); + } + + __forceinline double toScalar(const vdouble4& v) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble4 vreduce_min2(const vdouble4& x) { return min(x, shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 x = vreduce_min2(y); return min(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 vreduce_max2(const vdouble4& x) { return max(x,shuffle<1,0>(x)); } + __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 x = vreduce_max2(y); return max(x, shuffle2<1,0>(x)); } + + __forceinline vdouble4 
vreduce_and2(const vdouble4& x) { return x & shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_or2(const vdouble4& x) { return x | shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vdouble4 vreduce_add2(const vdouble4& x) { return x + shuffle<1,0>(x); } + __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline double reduce_add(const vdouble4& a) { return toScalar(vreduce_add(a)); } + __forceinline double reduce_min(const vdouble4& a) { return toScalar(vreduce_min(a)); } + __forceinline double reduce_max(const vdouble4& a) { return toScalar(vreduce_max(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h new file mode 100644 index 00000000000..4eec7d2f6a0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h @@ -0,0 +1,356 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 64-bit double type */ + template<> + struct vdouble<8> + { + ALIGNED_STRUCT_(64); + + 
typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512d v; + double i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble() {} + __forceinline vdouble(const vdouble8& t) { v = t.v; } + __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; } + + __forceinline vdouble(const __m512d& t) { v = t; } + __forceinline operator __m512d() const { return v; } + __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); } + + __forceinline vdouble(double i) { + v = _mm512_set1_pd(i); + } + + __forceinline vdouble(double a, double b, double c, double d) { + v = _mm512_set4_pd(d,c,b,a); + } + + __forceinline vdouble(double a0, double a1, double a2, double a3, + double a4, double a5, double a6, double a7) + { + v = _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {} + __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1)) {} + __forceinline vdouble(StepTy) : v(_mm512_set_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) { + _mm512_stream_pd((double*)ptr, a); + } + + static __forceinline vdouble8 loadu(const void* addr) { + return _mm512_loadu_pd((double*)addr); + } + + static __forceinline vdouble8 load(const 
vdouble8* addr) { + return _mm512_load_pd((double*)addr); + } + + static __forceinline vdouble8 load(const double* addr) { + return _mm512_load_pd(addr); + } + + static __forceinline void store(void* ptr, const vdouble8& v) { + _mm512_store_pd(ptr, v); + } + + static __forceinline void storeu(void* ptr, const vdouble8& v) { + _mm512_storeu_pd(ptr, v); + } + + static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) { + _mm512_mask_storeu_pd(ptr, mask, f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) { + _mm512_mask_store_pd(addr, mask, v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) { + _mm512_mask_compressstoreu_pd(addr, mask, reg); + } + + static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) { + return _mm512_mask_compress_pd(v, mask, v); + } + + static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) { + return _mm512_mask_compress_pd(a, mask, b); + } + + static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline double& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const double& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 asDouble(const vllong8& a) { return 
_mm512_castsi512_pd(a); } + __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); } + + __forceinline vdouble8 operator +(const vdouble8& a) { return a; } + __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); } + __forceinline vdouble8 operator +(const vdouble8& a, double b) { return a + vdouble8(b); } + __forceinline vdouble8 operator +(double a, const vdouble8& b) { return vdouble8(a) + b; } + + __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); } + __forceinline vdouble8 operator -(const vdouble8& a, double b) { return a - vdouble8(b); } + __forceinline vdouble8 operator -(double a, const vdouble8& b) { return vdouble8(a) - b; } + + __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); } + __forceinline vdouble8 operator *(const vdouble8& a, double b) { return a * vdouble8(b); } + __forceinline vdouble8 operator *(double a, const vdouble8& b) { return vdouble8(a) * b; } + + __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); } + __forceinline vdouble8 operator &(const vdouble8& a, double b) { return a & vdouble8(b); } + __forceinline vdouble8 operator &(double a, const vdouble8& b) { return vdouble8(a) & b; } + + __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); } + __forceinline vdouble8 operator |(const vdouble8& a, double b) { return a | vdouble8(b); } + __forceinline vdouble8 operator |(double a, const vdouble8& b) { return vdouble8(a) | b; } + + __forceinline vdouble8 operator ^(const 
vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); } + __forceinline vdouble8 operator ^(const vdouble8& a, double b) { return a ^ vdouble8(b); } + __forceinline vdouble8 operator ^(double a, const vdouble8& b) { return vdouble8(a) ^ b; } + + __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); } + __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); } + + __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); } + __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); } + + __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); } + __forceinline vdouble8 min(const vdouble8& a, double b) { return min(a,vdouble8(b)); } + __forceinline vdouble8 min(double a, const vdouble8& b) { return min(vdouble8(a),b); } + + __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); } + __forceinline vdouble8 max(const vdouble8& a, double b) { return max(a,vdouble8(b)); } + __forceinline vdouble8 max(double a, const vdouble8& b) { return max(vdouble8(a),b); } + + __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return 
_mm512_mask_add_pd(c,mask,a,b); } + __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); } + + __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); } + __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); } + __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); } + __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); } + __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { return a = a + b; } + __forceinline vdouble8& operator +=(vdouble8& a, double b) { return a = a + b; } + + __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { return a = a - b; } + __forceinline vdouble8& operator -=(vdouble8& a, double b) { return a = a - b; } + + __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { return a = a * b; } + __forceinline vdouble8& operator *=(vdouble8& a, double b) { return a = a * b; } + + __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { return a = a & b; } + __forceinline 
vdouble8& operator &=(vdouble8& a, double b) { return a = a & b; } + + __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { return a = a | b; } + __forceinline vdouble8& operator |=(vdouble8& a, double b) { return a = a | b; } + + __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { return a = a << b; } + __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); } + __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; } + + __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); } + __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; } + + __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); } + __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; } + + __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); } + __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; } + + __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return 
_mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); } + __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; } + + __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); } + __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; } + + __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const 
vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) { + return _mm512_mask_or_pd(f,m,t,t); + } + + __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) { + const vdouble8 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) { + return _mm512_mask_test_epi64_mask(m,_mm512_castpd_si512(a),_mm512_castpd_si512(b)); + } + + __forceinline vboold8 test(const vdouble8& a, const vdouble8& b) { + return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); + } + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return shuffle(v); + } + + template + __forceinline vdouble8 shuffle(const vdouble8& v) { + return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template + __forceinline vdouble8 shuffle4(const vdouble8& v) { + return shuffle4(v); + } + + template + __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) { + return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i)); + } + + __forceinline double toScalar(const vdouble8& v) { + return _mm_cvtsd_f64(_mm512_castpd512_pd128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// 
Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 vreduce_add2(vdouble8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline vdouble8 vreduce_min2(vdouble8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vdouble8 vreduce_max2(vdouble8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); } + __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); } + __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) { + return _mm512_permutexvar_pd(index, v); + } + + __forceinline vdouble8 reverse(const vdouble8& a) { + return permute(a, vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v) + 
{ + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h new file mode 100644 index 00000000000..aed2419b779 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h @@ -0,0 +1,771 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 float type */ + template<> + struct vfloat<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512 v; + float f[16]; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat16& t) { v = t; } + __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; } + + __forceinline vfloat(const __m512& t) { v = t; } + __forceinline operator __m512() const { return v; } + __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } + __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } + + __forceinline vfloat(float f) { + v = _mm512_set1_ps(f); + } + + __forceinline vfloat(float a, float b, float c, float d) { + v = _mm512_set4_ps(a, b, c, d); + } + + __forceinline vfloat(const vfloat4& i) { + v = _mm512_broadcast_f32x4(i); + } + + __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { + v = _mm512_castps128_ps512(a); + v = _mm512_insertf32x4(v, b, 1); + v = _mm512_insertf32x4(v, c, 2); + v = _mm512_insertf32x4(v, d, 3); + } + + __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const 
vfloat4& b) { + v = _mm512_broadcast_f32x4(a); + v = _mm512_mask_broadcast_f32x4(v,mask,b); + } + + __forceinline vfloat(const vfloat8& i) { + v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i))); + } + + __forceinline vfloat(const vfloat8& a, const vfloat8& b) { + v = _mm512_castps256_ps512(a); +#if defined(__AVX512DQ__) + v = _mm512_insertf32x8(v, b, 1); +#else + v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); +#endif + } + + /* WARNING: due to f64x4 the mask is considered as an 8bit mask */ + __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) { + __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a)); + aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b)); + v = _mm512_castpd_ps(aa); + } + + __forceinline explicit vfloat(const vint16& a) { + v = _mm512_cvtepi32_ps(a); + } + + __forceinline explicit vfloat(const vuint16& a) { + v = _mm512_cvtepu32_ps(a); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm512_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm512_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vfloat(NaNTy) : v(_mm512_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); } + static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } + 
+ static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { + _mm512_stream_ps((float*)ptr,a); + } + + static __forceinline vfloat16 broadcast(const float* f) { + return _mm512_set1_ps(*f); + } + + static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) { + return _mm512_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) { + return _mm512_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) { + return _mm512_mask_expand_ps(b, mask, a); + } + + static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) { + return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr); + } + + static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) { + _mm512_mask_compressstoreu_ps(addr, mask, reg); + } + + static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) { + //_mm512_mask_compressstoreu_ps(addr,mask,reg); + *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg)); + } + + template + static 
__forceinline vfloat16 gather(const float* ptr, const vint16& index) { + return _mm512_i32gather_ps(index, ptr, scale); + } + + template + static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) { + vfloat16 r = zero; + return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale); + } + + template + static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) { + _mm512_i32scatter_ps(ptr, index, v, scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { + _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float& operator [](size_t index) { assert(index < 16); return f[index]; } + __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); } + __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); } + __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); } + + __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); } + __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); } + + __forceinline vfloat16 operator +(const vfloat16& a) { return a; } + __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); } + + __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } + __forceinline vfloat16 
signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } + + __forceinline vfloat16 rcp(const vfloat16& a) { +#if defined(__AVX512ER__) + return _mm512_rcp28_ps(a); +#else + const vfloat16 r = _mm512_rcp14_ps(a); + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); +#endif + } + + __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } + __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); } + + __forceinline vfloat16 rsqrt(const vfloat16& a) + { +#if defined(__AVX512VL__) + const vfloat16 r = _mm512_rsqrt14_ps(a); + return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r, + _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r))); +#else + return _mm512_rsqrt28_ps(a); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); } + __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); } + __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; } + + __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); } + __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); } + __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; } + + __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); } + __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); } + __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; } + + __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { 
return _mm512_div_ps(a,b); } + __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); } + __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; } + + __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); } + __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); } + __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); + } + + __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { + return _mm512_min_ps(a,b); + } + __forceinline vfloat16 min(const vfloat16& a, float b) { + return _mm512_min_ps(a,vfloat16(b)); + } + __forceinline vfloat16 min(const float& a, const vfloat16& b) { + return _mm512_min_ps(vfloat16(a),b); + } + + __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { + return _mm512_max_ps(a,b); + } + __forceinline vfloat16 max(const vfloat16& a, float b) { + return _mm512_max_ps(a,vfloat16(b)); + } + __forceinline vfloat16 max(const float& a, const vfloat16& b) { + return _mm512_max_ps(vfloat16(a),b); + } + + __forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps (c,mask,a,b); } + __forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_mask_min_ps(c,mask,a,b); + }; + __forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_mask_max_ps(c,mask,a,b); + }; + + __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) { +#if !defined(__AVX512ER__) // SKX + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_min_epi32(ai,bi); + return _mm512_castsi512_ps(ci); +#else // KNL + 
return min(a,b); +#endif + } + + __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) { +#if !defined(__AVX512ER__) // SKX + const vint16 ai = _mm512_castps_si512(a); + const vint16 bi = _mm512_castps_si512(b); + const vint16 ci = _mm512_max_epi32(ai,bi); + return _mm512_castsi512_ps(ci); +#else // KNL + return max(a,b); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); } + __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); } + __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); } + + __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); } + + __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); } + __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); } + __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); } + __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Operators with rounding + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& 
c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 madd_round_up (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mul_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 add_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 sub_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 div_round_up (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_msub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 
mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_mul_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } + __forceinline vfloat16 mask_sub_round_up (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; } + __forceinline vfloat16& operator +=(vfloat16& a, float b) { return a = a + b; } + + __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; } + __forceinline vfloat16& operator -=(vfloat16& a, float b) { return a = a - b; } + + __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; } + __forceinline vfloat16& operator *=(vfloat16& a, float b) { return a = a * b; } + + __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; } + __forceinline vfloat16& operator /=(vfloat16& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vfloat16& a, const 
vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); } + __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; } + + __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); } + __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; } + + __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); } + __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; } + + __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); } + __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; } + + __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); } + __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; } + + __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); } + __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; } + + __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vfloat16& a, const 
vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) { + return _mm512_mask_blend_ps(s, f, t); + } + + __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) { + return madd(t,b-a,a); + } + + __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b) + { + vfloat16 c = a; + a = select(m,b,a); + b = select(m,c,b); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// 
+ + __forceinline vfloat16 floor(const vfloat16& a) { + return _mm512_floor_ps(a); + } + __forceinline vfloat16 ceil (const vfloat16& a) { + return _mm512_ceil_ps(a); + } + __forceinline vfloat16 round (const vfloat16& a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + } + __forceinline vint16 floori (const vfloat16& a) { + return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); } + __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); } + + template + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat16 shuffle(const vfloat16& v) { + return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat16 shuffle4(const vfloat16& v) { + return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) { + /* mask should be 8-bit but is 
16-bit to reuse for interleave_even */ + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave2_odd(const vfloat16& a, const vfloat16& b) { + /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */ + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1)); + } + + __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e)); + } + + __forceinline vfloat16 permute(vfloat16 v, __m512i index) { + return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v))); + } + + __forceinline vfloat16 reverse(const vfloat16& v) { + return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + } + + template + __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + template + __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) { + return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i)); + }; + + __forceinline vfloat16 shift_left_1(const vfloat16& a) { + vfloat16 z = zero; + return mask_align_shift_right<15>(0xfffe,z,a,a); + } + + __forceinline vfloat16 shift_right_1(const vfloat16& x) { + return align_shift_right<1>(zero,x); + } + + 
__forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } + + + template __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } + + template + vfloat extractN(const vfloat16& v); + + template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } + template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } + template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } + + template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } + + template __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); } + + template __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } + template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + vfloat16 a0a1_c0c1 = interleave_even(r0, r1); + vfloat16 a2a3_c2c3 = interleave_even(r2, r3); + vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); + vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); + + c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); + c1 = 
interleave2_even(b0b1_d0d1, b2b3_d2d3); + c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); + c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); +#else + vfloat16 a0a2_b0b2 = unpacklo(r0, r2); + vfloat16 c0c2_d0d2 = unpackhi(r0, r2); + vfloat16 a1a3_b1b3 = unpacklo(r1, r3); + vfloat16 c1c3_d1d3 = unpackhi(r1, r3); + + c0 = unpacklo(a0a2_b0b2, a1a3_b1b3); + c1 = unpackhi(a0a2_b0b2, a1a3_b1b3); + c2 = unpacklo(c0c2_d0d2, c1c3_d1d3); + c3 = unpackhi(c0c2_d0d2, c1c3_d1d3); +#endif + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, + const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + const vfloat4& r8, const vfloat4& r9, const vfloat4& r10, const vfloat4& r11, + const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3) + { + return transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13), vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), + c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3, + const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3; + transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); + + vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7; + transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); + + c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + c4 = 
interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); + c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7); + c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7); + c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, + const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + const vfloat8& r8, const vfloat8& r9, const vfloat8& r10, const vfloat8& r11, + const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15, + vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3, + vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7) + { + return transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11), + vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15), + c0, c1, c2, c3, c4, c5, c6, c7); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 vreduce_add2(vfloat16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline vfloat16 vreduce_min2(vfloat16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vfloat16 vreduce_max2(vfloat16 
x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline float reduce_add(const vfloat16& v) { return toScalar(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat16& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); } + + __forceinline size_t select_min(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_max(const vfloat16& v) { + return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); + } + + __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(pos_inf)); + const vbool16 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v) + { + const vfloat16 a = select(valid,v,vfloat16(neg_inf)); + const vbool16 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? 
valid_max : valid)); + } + + __forceinline vfloat16 prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a) + { + const vfloat16 z(zero); + vfloat16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + __forceinline vfloat16 prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<16-1>(v,z)); + v = min(v,align_shift_right<16-2>(v,z)); + v = min(v,align_shift_right<16-4>(v,z)); + v = min(v,align_shift_right<16-8>(v,z)); + return v; + } + + __forceinline vfloat16 prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<16-1>(v,z)); + v = max(v,align_shift_right<16-2>(v,z)); + v = max(v,align_shift_right<16-4>(v,z)); + v = max(v,align_shift_right<16-8>(v,z)); + return v; + } + + + __forceinline vfloat16 reverse_prefix_min(const vfloat16& a) + { + const vfloat16 z(pos_inf); + vfloat16 v = a; + v = min(v,align_shift_right<1>(z,v)); + v = min(v,align_shift_right<2>(z,v)); + v = min(v,align_shift_right<4>(z,v)); + v = min(v,align_shift_right<8>(z,v)); + return v; + } + + __forceinline vfloat16 reverse_prefix_max(const vfloat16& a) + { + const vfloat16 z(neg_inf); + vfloat16 v = a; + v = max(v,align_shift_right<1>(z,v)); + v = max(v,align_shift_right<2>(z,v)); + v = max(v,align_shift_right<4>(z,v)); + v = max(v,align_shift_right<8>(z,v)); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat16 
loadAOS4to16f(const float& x, const float& y, const float& z) + { + vfloat16 f = zero; + f = select(0x1111,vfloat16::broadcast(&x),f); + f = select(0x2222,vfloat16::broadcast(&y),f); + f = select(0x4444,vfloat16::broadcast(&z),f); + return f; + } + + __forceinline vfloat16 loadAOS4to16f(unsigned int index, + const vfloat16& x, + const vfloat16& y, + const vfloat16& z) + { + vfloat16 f = zero; + f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); + f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); + f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); + return f; + } + + __forceinline vfloat16 loadAOS4to16f(unsigned int index, + const vfloat16& x, + const vfloat16& y, + const vfloat16& z, + const vfloat16& fill) + { + vfloat16 f = fill; + f = select(0x1111,vfloat16::broadcast((float*)&x + index),f); + f = select(0x2222,vfloat16::broadcast((float*)&y + index),f); + f = select(0x4444,vfloat16::broadcast((float*)&z + index),f); + return f; + } + + __forceinline vfloat16 rcp_safe(const vfloat16& a) { + return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h new file mode 100644 index 00000000000..5732c0fbc85 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h @@ -0,0 +1,925 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide SSE float type */ + template<> + struct vfloat<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 
Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128 v; float f[4]; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat4& other) { v = other.v; } + __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; } + + __forceinline vfloat(__m128 a) : v(a) {} + __forceinline operator const __m128&() const { return v; } + __forceinline operator __m128&() { return v; } + + __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} + + __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} +#if defined(__aarch64__) + __forceinline explicit vfloat(const vuint4& x) { + v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); + } +#else + __forceinline explicit vfloat(const vuint4& x) { + const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128 af = _mm_cvtepi32_ps(a); + const __m128 bf = _mm_castsi128_ps(b); + v = _mm_add_ps(af,bf); + } +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : 
v(_mm_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } + static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } + + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { + return _mm_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { + return _mm_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +#else + static __forceinline 
vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } +#endif + +#if defined(__AVX__) + static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } +#else + static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } +#endif + + static __forceinline vfloat4 load_nt (const float* ptr) { +#if defined (__SSE4_1__) + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr)); +#else + return _mm_load_ps(ptr); +#endif + } + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const int8_t* ptr) { + return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const int8_t* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const int8_t* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const uint8_t* ptr) { + return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const uint8_t* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const uint8_t* ptr) { + //return _mm_cvtpu8_ps(*(__m64*)ptr); // don't enable, will use MMX instructions + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + +#if defined(__aarch64__) + static __forceinline vfloat4 load(const short* ptr) { + return 
__m128(_mm_load4epi16_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) + static __forceinline vfloat4 load(const short* ptr) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); + } +#else + static __forceinline vfloat4 load(const short* ptr) { + return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]); + } +#endif + + static __forceinline vfloat4 load(const unsigned short* ptr) { + return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + } + + static __forceinline void store_nt(void* ptr, const vfloat4& v) + { +#if defined (__SSE4_1__) +#if defined(__aarch64__) + _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v)); +#else + _mm_stream_ps((float*)ptr,v); +#endif +#else + _mm_store_ps((float*)ptr,v); +#endif + } + + template + static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_ps(ptr, index, scale); +#else + return vfloat4( + *(float*)(((int8_t*)ptr)+scale*index[0]), + *(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { + vfloat4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_ps((float*)ptr, index, v, scale); +#else + 
*(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); +#else + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 4); return f[index]; } + + friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_ps(m, f, t); +#elif defined(__SSE4_1__) || (defined(__aarch64__)) + return _mm_blendv_ps(f, t, m); +#else + return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); +#endif + } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } + 
// Reinterpret the 128 bits of a vfloat4 as packed 32-bit (unsigned) integers.
// These are pure bit casts (_mm_castps_si128): no value conversion happens.
__forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); }
__forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); }

// Value conversions (as opposed to the bit casts above).
__forceinline vint4 toInt (const vfloat4& a) { return vint4(a); }
__forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); }

// Unary plus is the identity.
__forceinline vfloat4 operator +(const vfloat4& a) { return a; }
#if defined(__aarch64__)
__forceinline vfloat4 operator -(const vfloat4& a) {
  return vnegq_f32(a);
}
#else
// Negate by flipping the IEEE-754 sign bit of every lane.
__forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif

#if defined(__aarch64__)
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
#else
// |a|: clear the sign bit of every lane.
__forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
#endif

// sign(a): per lane, -1.0f where a < 0 and +1.0f otherwise.
#if defined(__AVX512VL__)
__forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
#else
__forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
#endif

// signmsk(a): isolate just the sign bit of every lane.
#if defined(__aarch64__)
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); }
#else
__forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
#endif

// rcp(a): fast approximate per-lane reciprocal — hardware estimate refined
// with Newton-Raphson style iterations (see the r*(2 - r*a) step below).
__forceinline vfloat4 rcp(const vfloat4& a)
{
#if defined(__aarch64__)
#if defined(BUILD_IOS)
  // iOS build: use an exact division instead of the estimate/refine sequence.
  return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
#else //BUILD_IOS
  __m128 reciprocal = _mm_rcp_ps(a);
  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
  // +1 round since NEON's reciprocal estimate instruction has less accuracy than SSE2's rcp.
  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
  return (const vfloat4)reciprocal;
#endif // BUILD_IOS
#else

#if defined(__AVX512VL__)
  const vfloat4 r = _mm_rcp14_ps(a);
#else
  const vfloat4 r = _mm_rcp_ps(a);
#endif

  // One refinement step: r' = r * (2 - r*a).
#if defined(__AVX2__)
  return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
#else
  return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
#endif

#endif //defined(__aarch64__)
}
// Per-lane square and square root.
__forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
__forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }

// rsqrt(a): fast approximate per-lane 1/sqrt(a) — hardware estimate plus
// refinement (three vrsqrts steps on NEON, one polynomial step on x86).
__forceinline vfloat4 rsqrt(const vfloat4& a)
{
#if defined(__aarch64__)
  vfloat4 r = _mm_rsqrt_ps(a);
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
  return r;
#else

#if defined(__AVX512VL__)
  const vfloat4 r = _mm_rsqrt14_ps(a);
#else
  const vfloat4 r = _mm_rsqrt_ps(a);
#endif

  // Refinement step: r' = 1.5*r + (-0.5*a*r) * r*r.
#if defined(__AVX2__)
  return _mm_fmadd_ps(_mm_set1_ps(1.5f), r,
    _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#else
  return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r),
    _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
#endif

#endif
}

// isnan(a): per-lane NaN test. After clearing the sign bit, a lane is NaN
// exactly when its bits compare greater than 0x7f800000 (+inf) as an integer.
__forceinline vboolf4 isnan(const vfloat4& a) {
#if defined(__aarch64__)
  const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff));
#else
  const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
#endif
#if defined(__AVX512VL__)
  return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT);
#else
  return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000)));
#endif
}

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
+ __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } + __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } + + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } + __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } + + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } + __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } + + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } + __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } + + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } + 
__forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + +#if defined(__SSE4_1__) || defined(__aarch64__) + + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epi32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_min_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } + + __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { + const vint4 ai = _mm_castps_si128(a); + const vint4 bi = _mm_castps_si128(b); + const vint4 ci = _mm_max_epu32(ai,bi); + return _mm_castsi128_ps(ci); + } +#else + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { + return min(a,b); + } + + __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { + return max(a,b); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#else + +#if defined(__aarch64__) + __forceinline vfloat4 madd (const 
vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_madd_ps(a, b, c); //a*b+c; + } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return _mm_msub_ps(a, b, c); //-a*b+c; + } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { + return vnegq_f32(vfmaq_f32(c,a, b)); + } +#else + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } +#endif + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } + __forceinline vfloat4& operator +=(vfloat4& a, float b) { return a = a + b; } + + __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; } + __forceinline vfloat4& operator -=(vfloat4& a, float b) { return a = a - b; } + + __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; } + __forceinline vfloat4& operator *=(vfloat4& a, float b) { return a = a * b; } + + __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; } + __forceinline vfloat4& operator /=(vfloat4& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, 
b, _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } +#if defined(__aarch64__) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } +#else + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } +#endif + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } +#endif + + __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } + __forceinline vboolf4 operator ==(float a, const vfloat4& b) { return vfloat4(a) == b; } + + __forceinline vboolf4 operator !=(const vfloat4& a, float b) { return a != vfloat4(b); } + __forceinline vboolf4 operator !=(float a, const vfloat4& b) { return vfloat4(a) != b; } + + __forceinline vboolf4 operator < (const vfloat4& a, float b) { return a < vfloat4(b); } + __forceinline vboolf4 
operator < (float a, const vfloat4& b) { return vfloat4(a) < b; } + + __forceinline vboolf4 operator >=(const vfloat4& a, float b) { return a >= vfloat4(b); } + __forceinline vboolf4 operator >=(float a, const vfloat4& b) { return vfloat4(a) >= b; } + + __forceinline vboolf4 operator > (const vfloat4& a, float b) { return a > vfloat4(b); } + __forceinline vboolf4 operator > (float a, const vfloat4& b) { return vfloat4(a) > b; } + + __forceinline vboolf4 operator <=(const vfloat4& a, float b) { return a <= vfloat4(b); } + __forceinline vboolf4 operator <=(float a, const vfloat4& b) { return vfloat4(a) <= b; } + + __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } + __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; } + __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a < b; } + __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; } + __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a > b; } + __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); 
} +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) + { +#if defined(__SSE4_1__) + return _mm_blend_ps(f, t, mask); +#else + return select(vboolf4(mask), t, f); +#endif + } + +#if defined(__aarch64__) + template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); + } + template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F)); + } + template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0)); + } + template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF)); + } + template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00)); + } + template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F)); + } + template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0)); + } + template<> __forceinline vfloat4 select<7>(const 
vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF)); + } + template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000)); + } + template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F)); + } + template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0)); + } + template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF)); + } + template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00)); + } + template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F)); + } + template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0)); + } + template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) { + return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF)); + } +#endif + + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid(const vfloat4& v) { + return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); + } + + __forceinline bool is_finite(const vfloat4& a) { + return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { + return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + 
+#if defined(__aarch64__) + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? +#elif defined (__SSE4_1__) + __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundpd +#else + __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); } + __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); } + __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); } + __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(roundf(a[0]),roundf(a[1]),roundf(a[2]),roundf(a[3])); } +#endif + __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } + + __forceinline vint4 floori(const vfloat4& a) { +#if defined(__aarch64__) + return vcvtq_s32_f32(floor(a)); +#elif defined(__SSE4_1__) + return vint4(floor(a)); +#else + return vint4(a-vfloat4(0.5f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 unpacklo(const vfloat4& a, const 
vfloat4& b) { return _mm_unpacklo_ps(a, b); } + __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } + +#if defined(__aarch64__) + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } +#endif + +#if defined (__SSSE3__) + __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); + } +#endif + +#if defined(__aarch64__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); } +#elif defined(__SSE3__) + template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } + template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } + template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } +#endif + + template + __forceinline vfloat4 shuffle(const vfloat4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline float 
extract(const vfloat4& a); + template<> __forceinline float extract<0>(const vfloat4& b) { + return b[0]; + } + template<> __forceinline float extract<1>(const vfloat4& b) { + return b[1]; + } + template<> __forceinline float extract<2>(const vfloat4& b) { + return b[2]; + } + template<> __forceinline float extract<3>(const vfloat4& b) { + return b[3]; + } +#elif defined (__SSE4_1__) && !defined(__GNUC__) + template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#else + template __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle(a)); } + template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#endif + + +#if defined(__aarch64__) + template __forceinline vfloat4 insert(const vfloat4& a, float b); + template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b) + { + vfloat4 c = a; + c[3] = b; + return c; + } +#elif defined (__SSE4_1__) + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert(a, b); } + template __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert(a, _mm_set_ss(b)); } +#else + template __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; } + template __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; 
return c; } +#endif + +#if defined(__aarch64__) + __forceinline float toScalar(const vfloat4& v) { + return v[0]; + } +#else + __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); } +#endif + __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { + return vfloat4::broadcast(&a[k]); + } + + __forceinline vfloat4 shift_right_1(const vfloat4& x) { + return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); + } + +#if defined (__AVX2__) + __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { + return _mm_permutevar_ps(a,index); + } + + __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } + +#endif + +#if defined(__AVX512VL__) + template + __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) { + return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); + } +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting Network + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat4 sort_ascending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = min(a0,b0); + const vfloat4 d0 = max(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + const vfloat4 c1 = min(a1,b1); + const vfloat4 d1 = max(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = min(a2,b2); + const vfloat4 d2 = max(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vfloat4 sort_descending(const vfloat4& v) + { + const vfloat4 a0 = v; + const vfloat4 b0 = shuffle<1,0,3,2>(a0); + const vfloat4 c0 = max(a0,b0); + const vfloat4 d0 = min(a0,b0); + const vfloat4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vfloat4 b1 = shuffle<2,3,0,1>(a1); + 
const vfloat4 c1 = max(a1,b1); + const vfloat4 d1 = min(a1,b1); + const vfloat4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vfloat4 b2 = shuffle<0,2,1,3>(a2); + const vfloat4 c2 = max(a2,b2); + const vfloat4 d2 = min(a2,b2); + const vfloat4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2) + { + vfloat4 l02 = unpacklo(r0,r2); + vfloat4 h02 = unpackhi(r0,r2); + vfloat4 l13 = unpacklo(r1,r3); + vfloat4 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#else + __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = 
max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } +#endif + +#if defined(__aarch64__) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } + __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } +#else + __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } +#endif + + __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(pos_inf)); + const vbool4 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? valid_min : valid)); + } + __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v) + { + const vfloat4 a = select(valid,v,vfloat4(neg_inf)); + const vbool4 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? 
valid_max : valid)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline float dot(const vfloat4& a, const vfloat4& b) { + return reduce_add(a*b); + } + + __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b) + { + const vfloat4 a0 = a; + const vfloat4 b0 = shuffle<1,2,0,3>(b); + const vfloat4 a1 = shuffle<1,2,0,3>(a); + const vfloat4 b1 = b; + return shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } + +} diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h new file mode 100644 index 00000000000..3c7e4a8cdcb --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h @@ -0,0 +1,847 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX float type */ + template<> + struct vfloat<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { __m256 v; float f[8]; int i[8]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat() {} + __forceinline vfloat(const vfloat8& other) { v = other.v; } + __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } + + 
__forceinline vfloat(__m256 a) : v(a) {} + __forceinline operator const __m256&() const { return v; } + __forceinline operator __m256&() { return v; } + + __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} + __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + + __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} + __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} + __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} + __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat(ZeroTy) : v(_mm256_setzero_ps()) {} + __forceinline vfloat(OneTy) : v(_mm256_set1_ps(1.0f)) {} + __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {} + __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {} + __forceinline vfloat(StepTy) : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} + __forceinline vfloat(NaNTy) : v(_mm256_set1_ps(nan)) {} + __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vfloat8 broadcast(const void* a) { + return _mm256_broadcast_ss((float*)a); + } + + static __forceinline vfloat8 broadcast2(const float* a, const float* b) { +#if 
defined(__INTEL_COMPILER) + const vfloat8 v0 = _mm256_broadcast_ss(a); + const vfloat8 v1 = _mm256_broadcast_ss(b); + return _mm256_blend_ps(v1, v0, 0xf); +#else + return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a); +#endif + } + + static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { + return _mm256_broadcast_ps((__m128*)ptr); + } + + static __forceinline vfloat8 load(const int8_t* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const uint8_t* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load(const short* ptr) { +#if defined(__AVX2__) + return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); +#else + return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); +#endif + } + + static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } + static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } + + static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } + static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) { + return _mm256_mask_compress_ps(v, mask, v); + } + static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) { + return _mm256_mask_compress_ps(a, mask, b); + } + + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { 
return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } +#elif defined(__aarch64__) + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } +#else + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } +#endif + +#if defined(__AVX2__) + static __forceinline vfloat8 load_nt(void* ptr) { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); + } +#endif + + static __forceinline void store_nt(void* ptr, const vfloat8& v) { + _mm256_stream_ps((float*)ptr,v); + } + + template + static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm256_i32gather_ps(ptr, index ,scale); +#else + return vfloat8( + 
*(float*)(((int8_t*)ptr)+scale*index[0]), + *(float*)(((int8_t*)ptr)+scale*index[1]), + *(float*)(((int8_t*)ptr)+scale*index[2]), + *(float*)(((int8_t*)ptr)+scale*index[3]), + *(float*)(((int8_t*)ptr)+scale*index[4]), + *(float*)(((int8_t*)ptr)+scale*index[5]), + *(float*)(((int8_t*)ptr)+scale*index[6]), + *(float*)(((int8_t*)ptr)+scale*index[7])); +#endif + } + + template + static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { + vfloat8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]); + return r; + #endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_ps((float*)ptr, ofs, v, scale); +#else + *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, 
const vfloat8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(float*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(float*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(float*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(float*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { + scatter<1>(mask,ptr,ofs,v); + } + static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { + scatter<4>(mask,ptr,ofs,v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; } + __forceinline float& operator [](size_t index) { assert(index < 8); return f[index]; } + }; + + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); } + + __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); } + __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } + + __forceinline vfloat8 operator +(const vfloat8& a) { return a; } +#if !defined(__aarch64__) + __forceinline vfloat8 operator -(const 
vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); + } +#else + __forceinline vfloat8 operator -(const vfloat8& a) { + __m256 res; + res.lo = vnegq_f32(a.v.lo); + res.hi = vnegq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) +__forceinline vfloat8 abs(const vfloat8& a) { + const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return _mm256_and_ps(a, mask); +} +#else +__forceinline vfloat8 abs(const vfloat8& a) { + __m256 res; + res.lo = vabsq_f32(a.v.lo); + res.hi = vabsq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } +#else + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } +#endif + __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } + + + static __forceinline vfloat8 rcp(const vfloat8& a) + { +#if defined(BUILD_IOS) && defined(__aarch64__) + // ios devices are faster doing full divide, no need for NR fixup + vfloat8 ret; + const float32x4_t one = vdupq_n_f32(1.0f); + ret.v.lo = vdivq_f32(one, a.v.lo); + ret.v.hi = vdivq_f32(one, a.v.hi); + return ret; +#endif + +#if defined(__AVX512VL__) + const vfloat8 r = _mm256_rcp14_ps(a); +#else + const vfloat8 r = _mm256_rcp_ps(a); +#endif + +#if defined(__AVX2__) //&& !defined(aarch64) + return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); +#else + return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); +#endif + } + __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } + __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } + + static __forceinline vfloat8 rsqrt(const vfloat8& a) + { +#if 
defined(__AVX512VL__) + const vfloat8 r = _mm256_rsqrt14_ps(a); +#else + const vfloat8 r = _mm256_rsqrt_ps(a); +#endif + +#if defined(__AVX2__) + return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#else + return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } + __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } + __forceinline vfloat8 operator +(float a, const vfloat8& b) { return vfloat8(a) + b; } + + __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } + __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } + __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } + + __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } + __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } + __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } + + __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } + __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } + __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } + + __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } + __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } 
+ __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } + + __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } + __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } + __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } + + __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } + __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } + __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } + + /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ +#if defined(__AVX2__) + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epi32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_min_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + + static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { + const vint8 ai = _mm256_castps_si256(a); + const vint8 bi = _mm256_castps_si256(b); + const vint8 ci = _mm256_max_epu32(ai,bi); + return _mm256_castsi256_ps(ci); + } + +#else + + static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { + return 
asFloat(min(asInt(a),asInt(b))); + } + + static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { + return asFloat(max(asInt(a),asInt(b))); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Ternary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX2__) + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } +#else + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;} + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } + __forceinline vfloat8& operator +=(vfloat8& a, float b) { return a = a + b; } + + __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; } + __forceinline vfloat8& operator -=(vfloat8& a, float b) { return a = a - b; } + + __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; } + __forceinline 
vfloat8& operator *=(vfloat8& a, float b) { return a = a * b; } + + __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; } + __forceinline vfloat8& operator /=(vfloat8& a, float b) { return a = a / b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_mask_blend_ps(m, f, t); + } +#elif !defined(__aarch64__) + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, 
_CMP_NLE_US); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } + + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } +#else + __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } + __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } + __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } + __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } + __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } + __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } + + __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } + +#endif + + template + __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) { + return _mm256_blend_ps(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); } + __forceinline vboolf8 operator ==(const float& a, const vfloat8& b) { return vfloat8(a) == b; } + + __forceinline vboolf8 operator !=(const vfloat8& a, const float& b) { return a != vfloat8(b); } + __forceinline vboolf8 operator !=(const float& a, const vfloat8& b) { return vfloat8(a) != b; } + + __forceinline vboolf8 operator < (const vfloat8& a, const float& b) { return a < vfloat8(b); } + __forceinline vboolf8 operator < (const float& a, const vfloat8& b) { return vfloat8(a) < b; } + + __forceinline vboolf8 operator >=(const vfloat8& a, const float& b) { return a >= vfloat8(b); } + __forceinline vboolf8 operator >=(const float& a, const vfloat8& b) { return vfloat8(a) >= b; } + + __forceinline vboolf8 operator > 
(const vfloat8& a, const float& b) { return a > vfloat8(b); } + __forceinline vboolf8 operator > (const float& a, const vfloat8& b) { return vfloat8(a) > b; } + + __forceinline vboolf8 operator <=(const vfloat8& a, const float& b) { return a <= vfloat8(b); } + __forceinline vboolf8 operator <=(const float& a, const vfloat8& b) { return vfloat8(a) <= b; } + + __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } + __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; } + __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a < b; } + __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; } + __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a > b; } + __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& 
a, const vfloat8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); } +#endif + + __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { + return madd(t,b-a,a); + } + + __forceinline bool isvalid (const vfloat8& v) { + return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE))); + } + + __forceinline bool is_finite (const vfloat8& a) { + return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) { + return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX))); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Rounding Functions + //////////////////////////////////////////////////////////////////////////////// + +#if !defined(__aarch64__) + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } + __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } + __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } +#endif + + + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } + __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } + + template + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); + } + + template + __forceinline vfloat8 shuffle4(const vfloat8& v) { + return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) { + return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vfloat8 shuffle(const vfloat8& v) { + return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) { + return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); + } + +#if !defined(__aarch64__) + template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } + template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } + template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } +#endif + + __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } + template __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } + template __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } + template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); } + + __forceinline float toScalar(const vfloat8& v) { return 
_mm_cvtss_f32(_mm256_castps256_ps128(v)); } + + __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); } + +#if defined (__AVX2__) && !defined(__aarch64__) + __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { + return _mm256_permutevar8x32_ps(a, index); + } +#endif + +#if defined(__AVX512VL__) + template + static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) { + return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i)); + } +#endif + +#if defined (__AVX_I__) + template + static __forceinline vint4 convert_to_hf16(const vfloat8& a) { + return _mm256_cvtps_ph(a, mode); + } + + static __forceinline vfloat8 convert_from_hf16(const vint4& a) { + return _mm256_cvtph_ps(a); + } +#endif + + __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { + return vfloat4::broadcast(&a[k]); + } + + __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) { + return vfloat8::broadcast(&a[k]); + } + +#if defined(__AVX512VL__) + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + return align_shift_right<1>(zero,x); + } +#else + static __forceinline vfloat8 shift_right_1(const vfloat8& x) { + const vfloat8 t0 = shuffle<1,2,3,0>(x); + const vfloat8 t1 = shuffle4<1,0>(t0); + return _mm256_blend_ps(t0,t1,0x88); + } +#endif + + __forceinline vint8 floori(const vfloat8& a) { + return vint8(floor(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Transpose + //////////////////////////////////////////////////////////////////////////////// + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = 
unpacklo(h02,h13); + c3 = unpackhi(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + vfloat8 l02 = unpacklo(r0,r2); + vfloat8 h02 = unpackhi(r0,r2); + vfloat8 l13 = unpacklo(r1,r3); + vfloat8 h13 = unpackhi(r1,r3); + c0 = unpacklo(l02,l13); + c1 = unpackhi(l02,l13); + c2 = unpacklo(h02,h13); + } + + __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7) + { + vfloat8 h0,h1,h2,h3; transpose(r0,r1,r2,r3,h0,h1,h2,h3); + vfloat8 h4,h5,h6,h7; transpose(r4,r5,r6,r7,h4,h5,h6,h7); + c0 = shuffle4<0,2>(h0,h4); + c1 = shuffle4<0,2>(h1,h5); + c2 = shuffle4<0,2>(h2,h6); + c3 = shuffle4<0,2>(h3,h7); + c4 = shuffle4<1,3>(h0,h4); + c5 = shuffle4<1,3>(h1,h5); + c6 = shuffle4<1,3>(h2,h6); + c7 = shuffle4<1,3>(h3,h7); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3); + } + + __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7, + vfloat8& c0, vfloat8& c1, vfloat8& c2) + { + transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) + __forceinline vfloat8 vreduce_min2(const vfloat8& v) { 
return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } + __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } +#else + __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } + __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } + __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } + __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } + __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } + +#endif + __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(pos_inf)); + const vbool8 valid_min = valid & (a == vreduce_min(a)); + return bsf(movemask(any(valid_min) ? 
valid_min : valid)); + } + + __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v) + { + const vfloat8 a = select(valid,v,vfloat8(neg_inf)); + const vbool8 valid_max = valid & (a == vreduce_max(a)); + return bsf(movemask(any(valid_max) ? valid_max : valid)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Euclidian Space Operators (pairs of Vec3fa's) + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + // return vreduce_add4(a*b); + //} + + __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { + return _mm256_dp_ps(a,b,0x7F); + } + + __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) + { + const vfloat8 a0 = a; + const vfloat8 b0 = shuffle<1,2,0,3>(b); + const vfloat8 a1 = shuffle<1,2,0,3>(a); + const vfloat8 b1 = b; + return shuffle<1,2,0,3>(msub(a0,b0,a1*b1)); + } + + //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); } + //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); } + //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); } + //__forceinline float length (const vfloat<8>& a) { return sqrt(dot(a,a)); } + __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } + //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); } + //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); } + //__forceinline float area (const vfloat<8>& d) { return 2.0f*halfArea(d); } + //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; } + + //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) { + // const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); + //} + + 
//////////////////////////////////////////////////////////////////////////////// + /// In Register Sorting + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vfloat8 sort_ascending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = min(a0,b0); + const vfloat8 d0 = max(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = min(a1,b1); + const vfloat8 d1 = max(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = min(a2,b2); + const vfloat8 d2 = max(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = min(a3,b3); + const vfloat8 d3 = max(a3,b3); + const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = min(a4,b4); + const vfloat8 d4 = max(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = min(a5,b5); + const vfloat8 d5 = max(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vfloat8 sort_descending(const vfloat8& v) + { + const vfloat8 a0 = v; + const vfloat8 b0 = shuffle<1,0,3,2>(a0); + const vfloat8 c0 = max(a0,b0); + const vfloat8 d0 = min(a0,b0); + const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vfloat8 b1 = shuffle<2,3,0,1>(a1); + const vfloat8 c1 = max(a1,b1); + const vfloat8 d1 = min(a1,b1); + const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vfloat8 b2 = shuffle<1,0,3,2>(a2); + const vfloat8 c2 = max(a2,b2); + const vfloat8 d2 = min(a2,b2); + const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vfloat8 b3 = shuffle4<1,0>(a3); + const vfloat8 c3 = max(a3,b3); + const vfloat8 d3 = min(a3,b3); + 
const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vfloat8 b4 = shuffle<2,3,0,1>(a4); + const vfloat8 c4 = max(a4,b4); + const vfloat8 d4 = min(a4,b4); + const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vfloat8 b5 = shuffle<1,0,3,2>(a5); + const vfloat8 c5 = max(a5,b5); + const vfloat8 d5 = min(a5,b5); + const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h new file mode 100644 index 00000000000..3249bc2b455 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h @@ -0,0 +1,490 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 integer type */ + template<> + struct vint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vint16 Int; + typedef vfloat16 Float; + + enum { size = 16 }; // number of SIMD elements + union { // data + __m512i v; + int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint16& t) { v = t.v; } + __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } + + __forceinline vint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline 
operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vint(int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vint(int a, int b, int c, int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vint(int a0 , int a1 , int a2 , int a3, + int a4 , int a5 , int a6 , int a7, + int a8 , int a9 , int a10, int a11, + int a12, int a13, int a14, int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vint(const vint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { + v = _mm512_castsi128_si512(a); + v = _mm512_inserti32x4(v, b, 1); + v = _mm512_inserti32x4(v, c, 2); + v = _mm512_inserti32x4(v, d, 3); + } + + __forceinline vint(const vint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vint(const vint8& a, const vint8& b) { + v = _mm512_castsi256_si512(a); + v = _mm512_inserti64x4(v, b, 1); + } + + __forceinline explicit vint(const __m512& f) { + v = _mm512_cvtps_epi32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint16 load (const 
void* addr) { return _mm512_load_si512((int*)addr); } + + static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } + + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + + static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } + static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } + + static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } + + static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { + _mm512_mask_compressstoreu_epi32(addr,mask,reg); + } + + static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { + //_mm512_mask_compressstoreu_epi32(addr,mask,reg); + *(float*)addr = 
mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); + } + + static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template + static __forceinline vint16 gather(const int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template + static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template + static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template + static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + static __forceinline vint16 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + 
__forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vint16 operator +(const vint16& a) { return a; } + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } + __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } + + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } + __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } + + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } + __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } + + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } + __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } + + __forceinline vint16 operator |(const 
vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } + __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } + + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } + __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } + + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + + __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } + __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } + + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } + __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } + + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } + + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return 
_mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16& operator +=(vint16& a, const vint16& b) { return a = a + b; } + __forceinline vint16& operator +=(vint16& a, int b) { return a = a + b; } + + __forceinline vint16& operator -=(vint16& a, const vint16& b) { return a = a - b; } + __forceinline vint16& operator -=(vint16& a, int b) { return a = a - b; } + + __forceinline vint16& operator *=(vint16& a, const vint16& b) { return a = a * b; } + __forceinline vint16& operator *=(vint16& a, int b) { return a = a * b; } + + __forceinline vint16& operator &=(vint16& a, const vint16& b) { return a = a & b; } + __forceinline vint16& operator &=(vint16& a, int b) { return a = a & b; } + + __forceinline vint16& operator |=(vint16& a, const vint16& b) { return a = a | b; } + __forceinline vint16& operator |=(vint16& a, int b) { return a = a | b; } + + __forceinline vint16& operator <<=(vint16& a, int b) { return a = a << b; } + __forceinline vint16& operator >>=(vint16& a, int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator 
==(const vint16& a, int b) { return a == vint16(b); } + __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } + + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } + __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } + + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } + __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } + + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } + __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } + + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } + __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } + + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } + __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } + + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + 
__forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + + + __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { + const vint16 c = a; a = 
select(m,b,a); b = select(m,c,b); + } + + __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) { + return _mm512_mask_test_epi32_mask(m,a,b); + } + + __forceinline vboolf16 test(const vint16& a, const vint16& b) { + return _mm512_test_epi32_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + + template + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint16 shuffle(const vint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint16 shuffle4(const vint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline int toScalar(const vint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + template __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + + __forceinline size_t extract64bit(const vint16& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + template + vint extractN(const vint16& v); + + template<> __forceinline vint4 
extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + + template __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + + template __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 vreduce_min2(vint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min4(vint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_min8(vint16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_min (vint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_max2(vint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max4(vint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vint16 vreduce_max8(vint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vint16 vreduce_max (vint16 x) { x = vreduce_max8(x); return max(x, 
shuffle4<2,3,0,1>(x)); } + + __forceinline vint16 vreduce_and2(vint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_and4(vint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_and8(vint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_and (vint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_or2(vint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_or4(vint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_or8(vint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_or (vint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vint16 vreduce_add2(vint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vint16 vreduce_add4(vint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vint16 vreduce_add8(vint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vint16 vreduce_add (vint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline int reduce_min(const vint16& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint16& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_and(const vint16& v) { return toScalar(vreduce_and(v)); } + __forceinline int reduce_or (const vint16& v) { return toScalar(vreduce_or (v)); } + __forceinline int reduce_add(const vint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint16 conflict(const vint16& index) + { + return _mm512_conflict_epi32(index); + } + + __forceinline vint16 conflict(const vboolf16& mask, 
vint16& dest, const vint16& index) + { + return _mm512_mask_conflict_epi32(dest,mask,index); + } + + __forceinline vint16 convert_uint32_t(const __m512& f) { + return _mm512_cvtps_epu32(f); + } + + __forceinline vint16 permute(vint16 v, vint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vint16 reverse(const vint16 &a) { + return permute(a,vint16(reverse_step)); + } + + __forceinline vint16 prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vint16 reverse_prefix_sum(const vint16& a) + { + const vint16 z(zero); + vint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + /* this should use a vbool8 and a vint8_64...*/ + template + __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) + { +#if defined(__AVX512PF__) + _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h new file mode 100644 index 00000000000..96f105a7c53 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h @@ -0,0 +1,681 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" 
+ +namespace embree +{ + /* 4-wide SSE integer type */ + template<> + struct vint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint4& a) { v = a.v; } + __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } + + __forceinline vint(__m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} + + __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vint(OneTy) : v(_mm_set_epi32(1, 1, 1, 1)) {} + __forceinline vint(PosInfTy) : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {} + __forceinline vint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} + + __forceinline vint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vint(UndefinedTy) 
: v(_mm_castps_si128(_mm_undefined_ps())) {} + + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + + static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) { + return _mm_mask_compress_epi32(v, mask, v); + } + static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) { + return _mm_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { 
_mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + + +#if defined(__aarch64__) + static __forceinline vint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vint4 load(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vint4 loadu(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } +#else + + static __forceinline vint4 load(const uint8_t* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + + static __forceinline vint4 loadu(const uint8_t* ptr) { + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); + } + +#endif + + static __forceinline vint4 load(const unsigned short* ptr) { +#if defined(__aarch64__) + return __m128i(vmovl_u16(vld1_u16(ptr))); +#elif defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store(uint8_t* ptr, const vint4& v) { +#if defined(__aarch64__) + int32x4_t x = v; + uint16x4_t y = vqmovn_u32(uint32x4_t(x)); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); +#elif defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + 
*(int*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (uint8_t)v[i]; +#endif + } + + static __forceinline void store(unsigned short* ptr, const vint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; +#endif + } + + static __forceinline vint4 load_nt(void* ptr) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void store_nt(void* ptr, const vint4& v) { +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template + static __forceinline vint4 gather(const int* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_epi32(ptr, index, scale); +#else + return vint4( + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) { + vint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_i32scatter_epi32((int*)ptr, 
index, v, scale); +#else + *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + + template + static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) + { +#if defined(__AVX512VL__) + _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); +#else + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3]; +#endif + } + +#if defined(__x86_64__) || defined(__aarch64__) + static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__aarch64__) + return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); +#elif defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 
asBool(const vint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vint4 operator +(const vint4& a) { return a; } + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } +#if defined(__aarch64__) + __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } +#elif defined(__SSSE3__) + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } + __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } + + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } + __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#else + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +#endif + __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } + __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } + + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } + __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } + + __forceinline vint4 operator |(const vint4& a, const 
vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } + __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } + + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } + __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } + + __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } + + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } + __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } + __forceinline vint4& operator +=(vint4& a, int b) { return a = a + b; } + + __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } + __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } + __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } +#endif + + __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } + __forceinline vint4& operator &=(vint4& a, int b) { return a = a & b; } + + __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; } + __forceinline vint4& operator |=(vint4& a, int b) { return a = a | b; } + + __forceinline vint4& operator 
<<=(vint4& a, int b) { return a = a << b; } + __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vint4& a, int b) { return a == vint4(b); } + __forceinline vboolf4 operator ==(int a, const vint4& b) { return vint4(a) == b; } + + __forceinline vboolf4 operator !=(const vint4& a, int b) { return a != vint4(b); } + __forceinline vboolf4 operator 
!=(int a, const vint4& b) { return vint4(a) != b; } + + __forceinline vboolf4 operator < (const vint4& a, int b) { return a < vint4(b); } + __forceinline vboolf4 operator < (int a, const vint4& b) { return vint4(a) < b; } + + __forceinline vboolf4 operator >=(const vint4& a, int b) { return a >= vint4(b); } + __forceinline vboolf4 operator >=(int a, const vint4& b) { return vint4(a) >= b; } + + __forceinline vboolf4 operator > (const vint4& a, int b) { return a > vint4(b); } + __forceinline vboolf4 operator > (int a, const vint4& b) { return vint4(a) > b; } + + __forceinline vboolf4 operator <=(const vint4& a, int b) { return a <= vint4(b); } + __forceinline vboolf4 operator <=(int a, const vint4& b) { return vint4(a) <= b; } + + __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } + __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; } + __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a < b; } + __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; } + __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a > b; } + __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, 
const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a < b); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a > b); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vint4 select(const vint4& t, const vint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + + +#if defined(__aarch64__) || defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } + + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vint4 min(const vint4& a, int b) { return min(a,vint4(b)); } + __forceinline vint4 min(int a, const vint4& b) { return min(vint4(a),b); } + __forceinline vint4 max(const vint4& a, int b) { return max(a,vint4(b)); } + __forceinline vint4 max(int a, const vint4& b) { return max(vint4(a),b); } + + 
//////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + +#if defined(__aarch64__) + template + __forceinline vint4 shuffle(const vint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vint4 shuffle(const vint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + template + __forceinline vint4 shuffle(const vint4& a, const vint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } +#endif +#if defined(__SSE3__) + template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template + __forceinline vint4 shuffle(const vint4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline int extract(const vint4& b); + template __forceinline vint4 insert(const vint4& a, const int b); +#elif defined(__SSE4_1__) + template __forceinline int extract(const vint4& b) { return 
_mm_extract_epi32(b, src); } + template __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } +#else + template __forceinline int extract(const vint4& b) { return b[src&3]; } + template __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } +#endif + +#if defined(__aarch64__) + template<> __forceinline int extract<0>(const vint4& b) { + return b.v[0]; + } + template<> __forceinline int extract<1>(const vint4& b) { + return b.v[1]; + } + template<> __forceinline int extract<2>(const vint4& b) { + return b.v[2]; + } + template<> __forceinline int extract<3>(const vint4& b) { + return b.v[3]; + } + template<> __forceinline vint4 insert<0>(const vint4& a, int b) + { + vint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vint4 insert<1>(const vint4& a, int b) + { + vint4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vint4 insert<2>(const vint4& a, int b) + { + vint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vint4 insert<3>(const vint4& a, int b) + { + vint4 c = a; + c[3] = b; + return c; + } + + __forceinline int toScalar(const vint4& v) { + return v[0]; + } + + __forceinline size_t toSizeT(const vint4& v) { + uint64x2_t x = uint64x2_t(v.v); + return x[0]; + } +#else + template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } + + __forceinline size_t toSizeT(const vint4& v) { +#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround + return toScalar(v); +#elif defined(__ARM_NEON) + // FIXME(LTE): Do we need a swap(i.e. use lane 1)? 
+ return vgetq_lane_u64(*(reinterpret_cast(&v)), 0); +#else + return _mm_cvtsi128_si64(v); +#endif + } +#endif + +#if defined(__AVX512VL__) + + __forceinline vint4 permute(const vint4 &a, const vint4 &index) { + return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index)); + } + + template + __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) { + return _mm_alignr_epi32(a, b, i); + } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__aarch64__) || defined(__SSE4_1__) + +#if defined(__aarch64__) + __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } + + __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } + __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } + __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } +#else + __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } +#endif + + __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint4& 
v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + +#if (defined(__aarch64__)) || defined(__SSE4_1__) + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umin(a0,b0); + const vint4 d0 = umax(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umin(a1,b1); + const vint4 d1 = umax(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umin(a2,b2); + const vint4 d2 = umax(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v; + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = umax(a0,b0); + const vint4 d0 = umin(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = umax(a1,b1); + const vint4 d1 = umin(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = umax(a2,b2); + const vint4 d2 = umin(a2,b2); + const 
vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3; + } + +#else + + __forceinline vint4 usort_ascending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = min(a0,b0); + const vint4 d0 = max(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = min(a1,b1); + const vint4 d1 = max(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = min(a2,b2); + const vint4 d2 = max(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + + __forceinline vint4 usort_descending(const vint4& v) + { + const vint4 a0 = v-vint4(0x80000000); + const vint4 b0 = shuffle<1,0,3,2>(a0); + const vint4 c0 = max(a0,b0); + const vint4 d0 = min(a0,b0); + const vint4 a1 = select<0x5 /* 0b0101 */>(c0,d0); + const vint4 b1 = shuffle<2,3,0,1>(a1); + const vint4 c1 = max(a1,b1); + const vint4 d1 = min(a1,b1); + const vint4 a2 = select<0x3 /* 0b0011 */>(c1,d1); + const vint4 b2 = shuffle<0,2,1,3>(a2); + const vint4 c2 = max(a2,b2); + const vint4 d2 = min(a2,b2); + const vint4 a3 = select<0x2 /* 0b0010 */>(c2,d2); + return a3+vint4(0x80000000); + } + +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h new file mode 100644 index 00000000000..25a771284da --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint8_avx.h @@ -0,0 +1,464 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma 
once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : 
v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + +#if !defined(__aarch64__) + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const 
vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vint8 load(const uint8_t* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const uint8_t* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 load(const unsigned short* ptr) { + vint4 il = vint4::load(ptr+0); + vint4 ih = vint4::load(ptr+4); + return vint8(il,ih); + } + + static __forceinline vint8 loadu(const unsigned short* ptr) { + vint4 il = vint4::loadu(ptr+0); + vint4 ih = vint4::loadu(ptr+4); + return vint8(il,ih); + } + + static __forceinline void store(uint8_t* ptr, const vint8& i) { + vint4 il(i.vl); + vint4 ih(i.vh); + vint4::store(ptr + 0,il); + vint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vint8 gather(const int* ptr, const vint8& index) { + return vint8( + *(int*)(((int8_t*)ptr)+scale*index[0]), + *(int*)(((int8_t*)ptr)+scale*index[1]), + *(int*)(((int8_t*)ptr)+scale*index[2]), + *(int*)(((int8_t*)ptr)+scale*index[3]), + *(int*)(((int8_t*)ptr)+scale*index[4]), + *(int*)(((int8_t*)ptr)+scale*index[5]), + *(int*)(((int8_t*)ptr)+scale*index[6]), + *(int*)(((int8_t*)ptr)+scale*index[7])); + } + + template + static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) { + vint8 r = zero; + if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = 
*(int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]); + return r; + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + 
//////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } + __forceinline vint8 abs (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return 
vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } + + __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vint8 
umin(const vint8& a, int b) { return umin(a,vint8(b)); } + __forceinline vint8 umin(int a, const vint8& b) { return umin(vint8(a),b); } + + __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vint8 umax(const vint8& a, int b) { return umax(a,vint8(b)); } + __forceinline vint8 umax(int a, const vint8& b) { return umax(vint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator 
==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const 
vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } + + __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + __forceinline vint8 notand(const vboolf8& m, const vint8& f) { + return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { 
+ return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 
vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = 
select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator 
<<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h new file mode 100644 index 00000000000..4937d972cfd --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h @@ -0,0 +1,512 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint() {} + __forceinline vint(const vint8& a) { v = a.v; } + __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } + + __forceinline vint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vint(int a, int b, int 
c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} + + __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} + __forceinline vint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vint8 loadu(const void* ptr) { return 
_mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + +#if defined(__AVX512VL__) + + static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(uint8_t* ptr, const vint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = 
i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vint8 gather(const int *const ptr, const vint8& index) { + return _mm256_i32gather_epi32(ptr, index, scale); + } + + template + static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) { + vint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale); +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vint8 
broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } +#else + static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vint8 operator +(const vint8& a) { return a; } + __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } + __forceinline vint8 abs (const vint8& a) { return _mm256_abs_epi32(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); } + __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; } + + __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); } + __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; } + + __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); } + __forceinline vint8 operator *(const vint8& a, int b) { return a * 
vint8(b); } + __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } + + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); } + __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } + __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } + + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); } + __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } + __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } + + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } + __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } + + __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } + + __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } + + __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } + __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); } + 
__forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); } + + __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } + __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); } + __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); } + + __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } + __forceinline vint8& operator +=(vint8& a, int b) { return a = a + b; } + + __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; } + __forceinline vint8& operator -=(vint8& a, int b) { return a = a - b; } + + __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; } + __forceinline vint8& operator *=(vint8& a, int b) { return a = a * b; } + + __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; } + __forceinline vint8& operator &=(vint8& a, int b) { return a = a & b; } + + __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; } + __forceinline vint8& operator |=(vint8& a, int b) { return a = a | b; } + + __forceinline vint8& operator <<=(vint8& a, const int b) { return a = a << b; } + __forceinline vint8& operator >>=(vint8& a, const int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return 
_mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } + static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } + static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); } + static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } + static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); } + + static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template + __forceinline vint8 select(const vint8& t, const vint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); } + __forceinline vboolf8 operator ==(int a, const vint8& 
b) { return vint8(a) == b; } + + __forceinline vboolf8 operator !=(const vint8& a, int b) { return a != vint8(b); } + __forceinline vboolf8 operator !=(int a, const vint8& b) { return vint8(a) != b; } + + __forceinline vboolf8 operator < (const vint8& a, int b) { return a < vint8(b); } + __forceinline vboolf8 operator < (int a, const vint8& b) { return vint8(a) < b; } + + __forceinline vboolf8 operator >=(const vint8& a, int b) { return a >= vint8(b); } + __forceinline vboolf8 operator >=(int a, const vint8& b) { return vint8(a) >= b; } + + __forceinline vboolf8 operator > (const vint8& a, int b) { return a > vint8(b); } + __forceinline vboolf8 operator > (int a, const vint8& b) { return vint8(a) > b; } + + __forceinline vboolf8 operator <=(const vint8& a, int b) { return a <= vint8(b); } + __forceinline vboolf8 operator <=(int a, const vint8& b) { return vint8(a) <= b; } + + __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } + __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; } + __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a < b; } + __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; } + __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a > b; } + __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } + 
static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } + static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); } + static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a < b); } + static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); } + static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a > b); } + static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vint8 shuffle4(const vint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vint8 shuffle(const vint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), 
_MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vint8 shuffle(const vint8& a, const vint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + +#if !defined(__aarch64__) + +__forceinline vint8 permute(const vint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + + + template + static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + +#endif + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } + __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + + __forceinline vint8 assign(const vint4& a) { return _mm256_castsi128_si256(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Sorting networks + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vint8 usort_ascending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umin(a0,b0); + const vint8 d0 = umax(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umin(a1,b1); + const vint8 d1 = umax(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umin(a2,b2); + const vint8 d2 = umax(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umin(a3,b3); + const vint8 d3 = umax(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umin(a4,b4); + const vint8 d4 = umax(a4,b4); + const vint8 a5 = select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umin(a5,b5); + const vint8 d5 = umax(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + __forceinline vint8 usort_descending(const vint8& v) + { + const vint8 a0 = v; + const vint8 b0 = shuffle<1,0,3,2>(a0); + const vint8 c0 = umax(a0,b0); + const vint8 d0 = umin(a0,b0); + const vint8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); + const vint8 b1 = shuffle<2,3,0,1>(a1); + const vint8 c1 = umax(a1,b1); + const vint8 d1 = umin(a1,b1); + const vint8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1); + const vint8 b2 = shuffle<1,0,3,2>(a2); + const vint8 c2 = umax(a2,b2); + const vint8 d2 = umin(a2,b2); + const vint8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2); + const vint8 b3 = shuffle4<1,0>(a3); + const vint8 c3 = umax(a3,b3); + const vint8 d3 = umin(a3,b3); + const vint8 a4 = select<0xf /* 0b00001111 */>(c3,d3); + const vint8 b4 = shuffle<2,3,0,1>(a4); + const vint8 c4 = umax(a4,b4); + const vint8 d4 = umin(a4,b4); + const vint8 a5 = 
select<0x33 /* 0b00110011 */>(c4,d4); + const vint8 b5 = shuffle<1,0,3,2>(a5); + const vint8 c5 = umax(a5,b5); + const vint8 d5 = umin(a5,b5); + const vint8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); + return a6; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h new file mode 100644 index 00000000000..de3ebc16a72 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h @@ -0,0 +1,358 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 4-wide AVX2 64-bit long long type */ + template<> + struct vllong<4> + { + ALIGNED_STRUCT_(32); + + typedef vboold4 Bool; + + enum { size = 4 }; // number of SIMD elements + union { // data + __m256i v; + long long i[4]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong4& t) { v = t.v; } + __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; } + + __forceinline vllong(const __m256i& t) { v = t; } + __forceinline operator __m256i() const { return v; } + __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); } + + + __forceinline vllong(long long i) { + v = _mm256_set1_epi64x(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = 
_mm256_set_epi64x(d,c,b,a); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vllong(OneTy) : v(_mm256_set1_epi64x(1)) {} + __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); + } + + static __forceinline vllong4 loadu(const void* addr) + { + return _mm256_loadu_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const vllong4* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline vllong4 load(const long long* addr) { + return _mm256_load_si256((__m256i*)addr); + } + + static __forceinline void store(void* ptr, const vllong4& v) { + _mm256_store_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong4& v) { + _mm256_storeu_si256((__m256i*)ptr,v); + } + + static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_storeu_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { +#if defined(__AVX512VL__) + _mm256_mask_store_epi64(ptr,mask,f); +#else + _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); +#endif + } + + static __forceinline vllong4 broadcast64bit(size_t v) { + return _mm256_set1_epi64x(v); + } + + static __forceinline size_t extract64bit(const 
vllong4& v) + { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 4); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { + #if defined(__AVX512VL__) + return _mm256_mask_blend_epi64(m, f, t); + #else + return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); + #endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); } +#else + __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); } +#endif + + __forceinline vllong4 operator +(const vllong4& a) { return a; } + __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); } + __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); } + __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; } + 
+ __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); } + __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); } + __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; } + + /* only low 32bit part */ + __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); } + __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); } + __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; } + + __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); } + __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); } + __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; } + + __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); } + __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); } + __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; } + + __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); } + __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); } + __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; } + + __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); } + //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); } + + __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); } + //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); } + //__forceinline vllong4 sra(const vllong4& a, long long b) { return 
_mm256_srai_epi64(a, b); } + + __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); } + + //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); } + //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); } + //__forceinline vllong4 min(long long a, const vllong4& b) { return min(vllong4(a),b); } + + //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); } + //__forceinline vllong4 max(const vllong4& a, long long b) { return max(a,vllong4(b)); } + //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); } + +#if defined(__AVX512VL__) + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); } +#else + __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); } + __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; } + __forceinline vllong4& operator +=(vllong4& a, long long b) { return a = a + b; } + + __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; } + __forceinline vllong4& operator -=(vllong4& a, long long b) { return a = a - b; } + + __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; } + __forceinline vllong4& operator *=(vllong4& a, 
long long b) { return a = a * b; } + + __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; } + __forceinline vllong4& operator &=(vllong4& a, long long b) { return a = a & b; } + + __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; } + __forceinline vllong4& operator |=(vllong4& a, long long b) { return a = a | b; } + + __forceinline vllong4& operator <<=(vllong4& a, long long b) { return a = a << b; } + //__forceinline vllong4& operator >>=(vllong4& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); } + __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); } + __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); } + __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); } + __forceinline vboold4 operator 
>=(const vllong4& a, const vllong4& b) { return !(a < b); } + __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); } +#endif + + __forceinline vboold4 operator ==(const vllong4& a, long long b) { return a == vllong4(b); } + __forceinline vboold4 operator ==(long long a, const vllong4& b) { return vllong4(a) == b; } + + __forceinline vboold4 operator !=(const vllong4& a, long long b) { return a != vllong4(b); } + __forceinline vboold4 operator !=(long long a, const vllong4& b) { return vllong4(a) != b; } + + __forceinline vboold4 operator > (const vllong4& a, long long b) { return a > vllong4(b); } + __forceinline vboold4 operator > (long long a, const vllong4& b) { return vllong4(a) > b; } + + __forceinline vboold4 operator < (const vllong4& a, long long b) { return a < vllong4(b); } + __forceinline vboold4 operator < (long long a, const vllong4& b) { return vllong4(a) < b; } + + __forceinline vboold4 operator >=(const vllong4& a, long long b) { return a >= vllong4(b); } + __forceinline vboold4 operator >=(long long a, const vllong4& b) { return vllong4(a) >= b; } + + __forceinline vboold4 operator <=(const vllong4& a, long long b) { return a <= vllong4(b); } + __forceinline vboold4 operator <=(long long a, const vllong4& b) { return vllong4(a) <= b; } + + __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; } + __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; } + __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a < b; } + __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; } + __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a > b; } + __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline 
vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); } + __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); } + __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a < b); } + __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); } + __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a > b); } + __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); } +#endif + + __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { + const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold4 test(const vllong4& a, const vllong4& b) { +#if defined(__AVX512VL__) + return _mm256_test_epi64_mask(a,b); +#else + return _mm256_testz_si256(a,b); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vllong4 
shuffle(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template + __forceinline vllong4 shuffle(const vllong4& v) { + return shuffle(v); + } + + template + __forceinline vllong4 shuffle2(const vllong4& v) { + return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); + } + + __forceinline long long toScalar(const vllong4& v) { + return _mm_cvtsi128_si64(_mm256_castsi256_si128(v)); + } + +#if defined(__AVX512VL__) + __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { + // workaround for GCC 7.x +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) + return _mm256_permutex2var_epi64(a,index,a); +#else + return _mm256_permutexvar_epi64(index,a); +#endif + } + + __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { + return _mm256_permutex2var_epi64(a,index,b); + } + +#endif + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + + __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } + __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } + __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } + + __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } + __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } + + __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } + __forceinline long long reduce_or (const vllong4& a) { return 
toScalar(vreduce_or(a)); } + __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<4; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h new file mode 100644 index 00000000000..76dddd8991d --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h @@ -0,0 +1,381 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX-512 64-bit long long type */ + template<> + struct vllong<8> + { + ALIGNED_STRUCT_(64); + + typedef vboold8 Bool; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m512i v; + long long i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong() {} + __forceinline vllong(const vllong8& t) { v = t.v; } + __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; } + + __forceinline vllong(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vllong(long long i) { + v = _mm512_set1_epi64(i); + } + + __forceinline vllong(long long a, long long b, long long c, long long d) { + v = _mm512_set4_epi64(d,c,b,a); + } + + __forceinline vllong(long long a0, long long a1, long long a2, long long a3, + long long a4, long long 
a5, long long a6, long long a7) + { + v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline vllong(const vllong<4>& i) { + v = _mm512_broadcast_i64x4(i); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vllong(OneTy) : v(_mm512_set1_epi64(1)) {} + __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {} + __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vllong8 loadu(const void* addr) { + return _mm512_loadu_si512(addr); + } + + static __forceinline vllong8 load(const vllong8* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const long long* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vllong8 load(const uint8_t* ptr) { + return _mm512_cvtepu8_epi64(*(__m128i*)ptr); + } + + static __forceinline void store(void* ptr, const vllong8& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vllong8& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) { + _mm512_mask_storeu_epi64(ptr,mask,f); + } + + static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) { + _mm512_mask_store_epi64(addr,mask,v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& 
reg) { + _mm512_mask_compressstoreu_epi64(addr,mask,reg); + } + + static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) { + return _mm512_mask_compress_epi64(dest,mask,source); + } + + static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_compress_epi64(a,mask,b); + } + + static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) { + return _mm512_mask_expand_epi64(b,mask,a); + } + + static __forceinline vllong8 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + static __forceinline size_t extract64bit(const vllong8& v) + { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline long long& operator [](size_t index) { assert(index < 8); return i[index]; } + __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; } + + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); } + + __forceinline vllong8 operator +(const vllong8& a) { return a; } + __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); } + __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); } + __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; } + + __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); } + __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); } + __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; } + + __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); } + __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); } + __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; } + + __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); } + __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); } + __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; } + + __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); } + __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); } + __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; } + + __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); } + __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); } + __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; } + + __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); } + __forceinline vllong8 operator >>(const 
vllong8& a, long long n) { return _mm512_srai_epi64(a, n); } + + __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); } + __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); } + + __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); } + __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); } + __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); } + + __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); } + __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); } + __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); } + + __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); } + __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); } + __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); } + + __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); } + __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); } + + __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); } + __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8& operator +=(vllong8& a, const vllong8& b) { return a = 
a + b; } + __forceinline vllong8& operator +=(vllong8& a, long long b) { return a = a + b; } + + __forceinline vllong8& operator -=(vllong8& a, const vllong8& b) { return a = a - b; } + __forceinline vllong8& operator -=(vllong8& a, long long b) { return a = a - b; } + + __forceinline vllong8& operator *=(vllong8& a, const vllong8& b) { return a = a * b; } + __forceinline vllong8& operator *=(vllong8& a, long long b) { return a = a * b; } + + __forceinline vllong8& operator &=(vllong8& a, const vllong8& b) { return a = a & b; } + __forceinline vllong8& operator &=(vllong8& a, long long b) { return a = a & b; } + + __forceinline vllong8& operator |=(vllong8& a, const vllong8& b) { return a = a | b; } + __forceinline vllong8& operator |=(vllong8& a, long long b) { return a = a | b; } + + __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; } + __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); } + __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; } + + __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); } + __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; } + + __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 operator < (const vllong8& a, long long b) 
{ return a < vllong8(b); } + __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; } + + __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); } + __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; } + + __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); } + __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; } + + __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); } + __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; } + + __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) 
{ return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); } + + __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) { + return _mm512_mask_or_epi64(f,m,t,t); + } + + __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b) { + const vllong8 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b) { + return _mm512_mask_test_epi64_mask(m,a,b); + } + + __forceinline vboold8 test(const vllong8& a, const vllong8& b) { + return _mm512_test_epi64_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); + } + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return shuffle(v); + } + + template + __forceinline vllong8 shuffle(const vllong8& v) { + return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + + template + __forceinline vllong8 shuffle4(const vllong8& v) { + return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); + } + + template + 
__forceinline vllong8 shuffle4(const vllong8& v) { + return shuffle4(v); + } + + template + __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) { + return _mm512_alignr_epi64(a, b, i); + }; + + __forceinline long long toScalar(const vllong8& v) { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + __forceinline vllong8 zeroExtend32Bit(const __m512i& a) { + return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 vreduce_min2(vllong8 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_min4(vllong8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_min (vllong8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_max2(vllong8 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vllong8 vreduce_max4(vllong8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vllong8 vreduce_max (vllong8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); } + + __forceinline vllong8 vreduce_and2(vllong8 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_and4(vllong8 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_and (vllong8 x) { x = vreduce_and4(x); return x & shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_or2(vllong8 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_or4(vllong8 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_or (vllong8 x) { x = vreduce_or4(x); return x | shuffle4<1,0>(x); } + + __forceinline vllong8 vreduce_add2(vllong8 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vllong8 vreduce_add4(vllong8 x) { x = vreduce_add2(x); return x + 
shuffle<2,3,0,1>(x); } + __forceinline vllong8 vreduce_add (vllong8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); } + + __forceinline long long reduce_min(const vllong8& v) { return toScalar(vreduce_min(v)); } + __forceinline long long reduce_max(const vllong8& v) { return toScalar(vreduce_max(v)); } + __forceinline long long reduce_and(const vllong8& v) { return toScalar(vreduce_and(v)); } + __forceinline long long reduce_or (const vllong8& v) { return toScalar(vreduce_or (v)); } + __forceinline long long reduce_add(const vllong8& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vllong8 permute(const vllong8& v, const vllong8& index) { + return _mm512_permutexvar_epi64(index,v); + } + + __forceinline vllong8 reverse(const vllong8& a) { + return permute(a,vllong8(reverse_step)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) + { + cout << "<" << v[0]; + for (size_t i=1; i<8; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h new file mode 100644 index 00000000000..39752611bbb --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h @@ -0,0 +1,443 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 16-wide AVX-512 unsigned integer type */ + template<> + struct vuint<16> + { + ALIGNED_STRUCT_(64); + + typedef vboolf16 Bool; + typedef vuint16 UInt; + typedef vfloat16 Float; + + enum { size = 
16 }; // number of SIMD elements + union { // data + __m512i v; + unsigned int i[16]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint16& t) { v = t.v; } + __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; } + + __forceinline vuint(const __m512i& t) { v = t; } + __forceinline operator __m512i() const { return v; } + __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + + __forceinline vuint(unsigned int i) { + v = _mm512_set1_epi32(i); + } + + __forceinline vuint(const vuint4& i) { + v = _mm512_broadcast_i32x4(i); + } + + __forceinline vuint(const vuint8& i) { + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + } + + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { + v = _mm512_set4_epi32(d,c,b,a); + } + + __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3, + unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7, + unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11, + unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15) + { + v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); + } + + __forceinline explicit vuint(const __m512& f) { + v = _mm512_cvtps_epu32(f); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {} + __forceinline vuint(OneTy) : v(_mm512_set1_epi32(1)) {} + __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + __forceinline 
vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { + _mm512_stream_si512((__m512i*)ptr,a); + } + + static __forceinline vuint16 loadu(const void* addr) + { + return _mm512_loadu_si512(addr); + } + + static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } + static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } + + static __forceinline vuint16 load(const vuint16* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(const unsigned int* addr) { + return _mm512_load_si512(addr); + } + + static __forceinline vuint16 load(unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(__m256i*)ptr); } + + + static __forceinline void store(void* ptr, const vuint16& v) { + _mm512_store_si512(ptr,v); + } + + static __forceinline void storeu(void* ptr, const vuint16& v) { + _mm512_storeu_si512(ptr,v); + } + + static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { + _mm512_mask_storeu_epi32(ptr,mask,f); + } + + static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { + _mm512_mask_store_epi32(addr,mask,v2); + } + + /* pass by value to avoid compiler generating inefficient code */ + static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) { + _mm512_mask_compressstoreu_epi32(addr,mask,reg); + } + + static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) { + //_mm512_mask_compressstoreu_epi32(addr,mask,reg); + *(float*)addr = 
_mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); + } + + static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi64(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { + return _mm512_mask_compress_epi32(v,mask,v); + } + + static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_compress_epi32(a,mask,b); + } + + static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { + return _mm512_mask_expand_epi32(b,mask,a); + } + + template + static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { + return _mm512_i32gather_epi32(index,ptr,scale); + } + + template + static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + } + + template + static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { + return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + } + + template + static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + } + + template + static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { + _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + } + + static __forceinline vuint16 broadcast64bit(size_t v) { + return _mm512_set1_epi64(v); + } + + static __forceinline size_t extract64bit(const vuint16& v) + { + return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + 
__forceinline unsigned int& operator [](size_t index) { assert(index < 16); return i[index]; } + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; } + + __forceinline unsigned int uint (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } + __forceinline size_t& uint64_t(size_t index) const { assert(index < 8); return ((size_t*)i)[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + + __forceinline vuint16 operator +(const vuint16& a) { return a; } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } + __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } + + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } + __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } + + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } + __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } + + __forceinline 
vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } + __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } + + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } + __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } + + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } + __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } + + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } + __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } + + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + 
__forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } + __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } + + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { return a = a + b; } + __forceinline vuint16& operator +=(vuint16& a, unsigned int b) { return a = a + b; } + + __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { return a = a - b; } + __forceinline vuint16& operator -=(vuint16& a, unsigned int b) { return a = a - b; } + + __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { return a = a * b; } + __forceinline vuint16& operator *=(vuint16& a, unsigned int b) { return a = a * b; } + + __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { return a = a & b; } + __forceinline vuint16& operator &=(vuint16& a, unsigned int b) { return a = a & b; } + + __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { return a = a | b; } + __forceinline vuint16& operator |=(vuint16& a, unsigned int b) { return a = a | b; } + + __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { return a = a << b; } + __forceinline vuint16& operator >>=(vuint16& a, 
unsigned int b) { return a = a >> b; } + + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } + __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } + + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } + __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } + + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } + __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } + + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } + __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } + + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } + __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } + + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } 
+ __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } + __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } + + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + + + __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { + return _mm512_mask_or_epi32(f,m,t,t); + } + + __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b) 
{ + const vuint16 c = a; a = select(m,b,a); b = select(m,c,b); + } + + __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b) { + return _mm512_mask_test_epi32_mask(m,a,b); + } + + __forceinline vboolf16 test(const vuint16& a, const vuint16& b) { + return _mm512_test_epi32_mask(a,b); + } + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint16 shuffle(const vuint16& v) { + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint16 shuffle4(const vuint16& v) { + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { + return _mm512_alignr_epi32(a, b, i); + }; + + __forceinline unsigned int toScalar(const vuint16& v) { + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 vreduce_min2(vuint16 x) { return min(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min4(vuint16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_min8(vuint16 x) { x = vreduce_min4(x); return min(x, 
shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_min (vuint16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_max2(vuint16 x) { return max(x, shuffle<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max4(vuint16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); } + __forceinline vuint16 vreduce_max8(vuint16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); } + __forceinline vuint16 vreduce_max (vuint16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); } + + __forceinline vuint16 vreduce_and2(vuint16 x) { return x & shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and4(vuint16 x) { x = vreduce_and2(x); return x & shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_and8(vuint16 x) { x = vreduce_and4(x); return x & shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_and (vuint16 x) { x = vreduce_and8(x); return x & shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_or2(vuint16 x) { return x | shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or4(vuint16 x) { x = vreduce_or2(x); return x | shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_or8(vuint16 x) { x = vreduce_or4(x); return x | shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_or (vuint16 x) { x = vreduce_or8(x); return x | shuffle4<2,3,0,1>(x); } + + __forceinline vuint16 vreduce_add2(vuint16 x) { return x + shuffle<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add4(vuint16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } + __forceinline vuint16 vreduce_add8(vuint16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } + __forceinline vuint16 vreduce_add (vuint16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } + + __forceinline unsigned int reduce_min(const vuint16& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint16& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_and(const 
vuint16& v) { return toScalar(vreduce_and(v)); } + __forceinline unsigned int reduce_or (const vuint16& v) { return toScalar(vreduce_or (v)); } + __forceinline unsigned int reduce_add(const vuint16& v) { return toScalar(vreduce_add(v)); } + + //////////////////////////////////////////////////////////////////////////////// + /// Memory load and store operations + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint16 permute(vuint16 v, vuint16 index) { + return _mm512_permutexvar_epi32(index,v); + } + + __forceinline vuint16 reverse(const vuint16& a) { + return permute(a,vuint16(reverse_step)); + } + + __forceinline vuint16 prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<16-1>(v,z); + v = v + align_shift_right<16-2>(v,z); + v = v + align_shift_right<16-4>(v,z); + v = v + align_shift_right<16-8>(v,z); + return v; + } + + __forceinline vuint16 reverse_prefix_sum(const vuint16& a) + { + const vuint16 z(zero); + vuint16 v = a; + v = v + align_shift_right<1>(z,v); + v = v + align_shift_right<2>(z,v); + v = v + align_shift_right<4>(z,v); + v = v + align_shift_right<8>(z,v); + return v; + } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) + { + cout << "<" << v[0]; + for (int i=1; i<16; i++) cout << ", " << v[i]; + cout << ">"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h new file mode 100644 index 00000000000..a3f393ebf2f --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h @@ -0,0 +1,499 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../math/math.h" + +namespace 
embree +{ + /* 4-wide SSE integer type */ + template<> + struct vuint<4> + { + ALIGNED_STRUCT_(16); + + typedef vboolf4 Bool; + typedef vuint4 Int; + typedef vfloat4 Float; + + enum { size = 4 }; // number of SIMD elements + union { __m128i v; unsigned int i[4]; }; // data + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint4& a) { v = a.v; } + __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } + + __forceinline vuint(const __m128i a) : v(a) {} + __forceinline operator const __m128i&() const { return v; } + __forceinline operator __m128i&() { return v; } + + + __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} +#endif + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm_setzero_si128()) {} + __forceinline vuint(OneTy) : v(_mm_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {} + __forceinline vuint(StepTy) : v(_mm_set_epi32(3, 2, 1, 0)) {} + __forceinline vuint(TrueTy) { v = _mm_cmpeq_epi32(v,v); } + __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + 
//////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } + static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } + + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + +#if defined(__AVX512VL__) + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } +#elif defined(__AVX__) + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } +#else + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store 
(ptr,select(mask,i,load (ptr))); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } +#endif + +#if defined(__aarch64__) + static __forceinline vuint4 load(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vuint4 loadu(const uint8_t* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) + static __forceinline vuint4 load(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + + static __forceinline vuint4 loadu(const uint8_t* ptr) { + return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); + } + +#endif + + static __forceinline vuint4 load(const unsigned short* ptr) { +#if defined(__aarch64__) + return _mm_load4epu16_epi32(((__m128i*)ptr)); +#elif defined (__SSE4_1__) + return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); +#else + return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); +#endif + } + + static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); +#elif defined(__SSE4_1__) + __m128i x = v; + x = _mm_packus_epi32(x, x); + x = _mm_packus_epi16(x, x); + *(unsigned*)ptr = _mm_cvtsi128_si32(x); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (uint8_t)v[i]; +#endif + } + + static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { +#if defined(__aarch64__) + uint32x4_t x = (uint32x4_t)v.v; + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else + for (size_t i=0;i<4;i++) + ptr[i] = (unsigned short)v[i]; +#endif + } + + static __forceinline vuint4 load_nt(void* ptr) { +#if (defined(__aarch64__)) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); +#else + return _mm_load_si128((__m128i*)ptr); +#endif + } + + static __forceinline void 
store_nt(void* ptr, const vuint4& v) { +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#else + _mm_store_si128((__m128i*)ptr,v); +#endif + } + + template + static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _mm_i32gather_epi32((const int*)ptr, index, scale); +#else + return vuint4( + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[3])); +#endif + } + + template + static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { + vuint4 r = zero; +#if defined(__AVX512VL__) + return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) + return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); +#else + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); + return r; +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 4); return i[index]; } + + friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { +#if defined(__AVX512VL__) + return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__SSE4_1__) + return 
_mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); +#endif + } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } +#else + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } +#endif + + __forceinline vuint4 operator +(const vuint4& a) { return a; } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } + __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } + + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } + __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); } +//#else +// __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } +//#endif +// __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } +// __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } + + __forceinline vuint4 
operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } + __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } + + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } + __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } + + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } + + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } + + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } + __forceinline vuint4& operator +=(vuint4& a, unsigned int b) { return a = a + b; } + + __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; } + __forceinline vuint4& operator -=(vuint4& a, unsigned int b) { return a = a - b; } + +//#if defined(__SSE4_1__) +// __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; } +// 
__forceinline vuint4& operator *=(vuint4& a, unsigned int b) { return a = a * b; } +//#endif + + __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; } + __forceinline vuint4& operator &=(vuint4& a, unsigned int b) { return a = a & b; } + + __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; } + __forceinline vuint4& operator |=(vuint4& a, unsigned int b) { return a = a | b; } + + __forceinline vuint4& operator <<=(vuint4& a, unsigned int b) { return a = a << b; } + __forceinline vuint4& operator >>=(vuint4& a, unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } +#else + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } + //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } + //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { 
return !(a < b); } + //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); } + //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a > b); } +#endif + + __forceinline vboolf4 operator ==(const vuint4& a, unsigned int b) { return a == vuint4(b); } + __forceinline vboolf4 operator ==(unsigned int a, const vuint4& b) { return vuint4(a) == b; } + + __forceinline vboolf4 operator !=(const vuint4& a, unsigned int b) { return a != vuint4(b); } + __forceinline vboolf4 operator !=(unsigned int a, const vuint4& b) { return vuint4(a) != b; } + + //__forceinline vboolf4 operator < (const vuint4& a, unsigned int b) { return a < vuint4(b); } + //__forceinline vboolf4 operator < (unsigned int a, const vuint4& b) { return vuint4(a) < b; } + + //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int b) { return a >= vuint4(b); } + //__forceinline vboolf4 operator >=(unsigned int a, const vuint4& b) { return vuint4(a) >= b; } + + //__forceinline vboolf4 operator > (const vuint4& a, unsigned int b) { return a > vuint4(b); } + //__forceinline vboolf4 operator > (unsigned int a, const vuint4& b) { return vuint4(a) > b; } + + //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int b) { return a <= vuint4(b); } + //__forceinline vboolf4 operator <=(unsigned int a, const vuint4& b) { return vuint4(a) <= b; } + + __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } + __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } + //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a < b; } + //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; } + //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a > b; } + //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& 
a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); } + //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a < b); } + //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); } + //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a > b); } + //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); } +#endif + + template + __forceinline vuint4 select(const vuint4& t, const vuint4& f) { +#if defined(__SSE4_1__) + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); +#else + return select(vboolf4(mask), t, f); +#endif + } + +/*#if defined(__SSE4_1__) + __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); } + +#else + __forceinline vuint4 
min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); } + __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); } +#endif + + __forceinline vuint4 min(const vuint4& a, unsigned int b) { return min(a,vuint4(b)); } + __forceinline vuint4 min(unsigned int a, const vuint4& b) { return min(vuint4(a),b); } + __forceinline vuint4 max(const vuint4& a, unsigned int b) { return max(a,vuint4(b)); } + __forceinline vuint4 max(unsigned int a, const vuint4& b) { return max(vuint4(a),b); }*/ + + //////////////////////////////////////////////////////////////////////////////// + // Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + +#if defined(__aarch64__) + template + __forceinline vuint4 shuffle(const vuint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else + template + __forceinline vuint4 shuffle(const vuint4& v) { + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + } + template + __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } +#endif +#if defined(__SSE3__) + template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const 
vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } +#endif + + template + __forceinline vuint4 shuffle(const vuint4& v) { + return shuffle(v); + } + +#if defined(__aarch64__) + template __forceinline unsigned int extract(const vuint4& b); + template __forceinline vuint4 insert(const vuint4& a, const unsigned b); +#elif defined(__SSE4_1__) + template __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } + template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#else + template __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } + template __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } +#endif + +#if defined(__aarch64__) + template<> __forceinline unsigned int extract<0>(const vuint4& b) { + return b[0]; + } + template<> __forceinline unsigned int extract<1>(const vuint4& b) { + return b[1]; + } + template<> __forceinline unsigned int extract<2>(const vuint4& b) { + return b[2]; + } + template<> __forceinline unsigned int extract<3>(const vuint4& b) { + return b[3]; + } + + template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[0] = b; + return c; + } + template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[1] = b; + return c; + } + template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[2] = b; + return c; + } + template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){ + vuint4 c = a; + c[3] = b; + return c; + } + + __forceinline unsigned int toScalar(const vuint4& v) { + return v[0]; + } +#else + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + + __forceinline 
unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + +#if 0 +#if defined(__SSE4_1__) + + __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } + __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } + + __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); } + __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); } + __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); } + + __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); } + __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + +#else + + __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); } + __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; } + +#endif +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + 
//////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">"; + } +} + diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h new file mode 100644 index 00000000000..d4e86ae92d7 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h @@ -0,0 +1,379 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + struct { __m128i vl,vh; }; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int 
a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int vh) : v(_mm256_set_epi32(vh, g, f, e, d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } + static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } + + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + +#if !defined(__aarch64__) + static __forceinline void 
store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } +#else + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } +#endif + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline vuint8 load(const uint8_t* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const uint8_t* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 load(const unsigned short* ptr) { + vuint4 il = vuint4::load(ptr+0); + vuint4 ih = vuint4::load(ptr+4); + return vuint8(il,ih); + } + + static __forceinline vuint8 loadu(const unsigned short* ptr) { + vuint4 il = vuint4::loadu(ptr+0); + vuint4 ih = vuint4::loadu(ptr+4); + return vuint8(il,ih); + } + + static __forceinline void store(uint8_t* ptr, const vuint8& i) { + vuint4 il(i.vl); + vuint4 ih(i.vh); + vuint4::store(ptr + 0,il); + vuint4::store(ptr + 4,ih); + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) { + return vuint8( + *(unsigned int*)(((int8_t*)ptr)+scale*index[0]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[1]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[2]), + *(unsigned 
int*)(((int8_t*)ptr)+scale*index[3]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[4]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[5]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[6]), + *(unsigned int*)(((int8_t*)ptr)+scale*index[7])); + } + + template + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) { + vuint8 r = zero; + if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]); + if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]); + if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]); + if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]); + if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]); + if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]); + if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]); + if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]); + return r; + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned 
int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; + } + + + static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int 
a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srai_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } + + __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } + __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } + __forceinline vuint8 
srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } + __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; } + + 
//////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)), + _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)), + // _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); } + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, const 
vuint8& b) { return !(a > b); } + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } + + __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) { + return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); + } + + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle(const 
vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + 
//__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h new file mode 100644 index 
00000000000..b2a965448d5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h @@ -0,0 +1,439 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* 8-wide AVX integer type */ + template<> + struct vuint<8> + { + ALIGNED_STRUCT_(32); + + typedef vboolf8 Bool; + typedef vuint8 Int; + typedef vfloat8 Float; + + enum { size = 8 }; // number of SIMD elements + union { // data + __m256i v; + unsigned int i[8]; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint() {} + __forceinline vuint(const vuint8& a) { v = a.v; } + __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } + + __forceinline vuint(__m256i a) : v(a) {} + __forceinline operator const __m256i&() const { return v; } + __forceinline operator __m256i&() { return v; } + + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + + __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} + __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {} + __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} + __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, 
d, c, b, a)) {} + + __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} + +#if defined(__AVX512VL__) + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} +#else + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Constants + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint(ZeroTy) : v(_mm256_setzero_si256()) {} + __forceinline vuint(OneTy) : v(_mm256_set1_epi32(1)) {} + __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} + __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {} + __forceinline vuint(StepTy) : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} + __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {} + + //////////////////////////////////////////////////////////////////////////////// + /// Loads and Stores + //////////////////////////////////////////////////////////////////////////////// + + static __forceinline vuint8 load(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } + static __forceinline vuint8 load(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } + static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } + + static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } + static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } + + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } 
+ +#if defined(__AVX512VL__) + + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { + return _mm256_mask_compress_epi32(v, mask, v); + } + static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { + return _mm256_mask_compress_epi32(a, mask, b); + } + + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } +#else + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } +#endif + + static __forceinline vuint8 load_nt(void* ptr) { + return _mm256_stream_load_si256((__m256i*)ptr); + } + + static __forceinline void store_nt(void* ptr, const vuint8& v) { + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + } + + static __forceinline void store(uint8_t* ptr, const vuint8& i) + { + for (size_t j=0; j<8; j++) + ptr[j] = i[j]; + } + + static __forceinline void store(unsigned short* ptr, const vuint8& v) { + for (size_t i=0;i<8;i++) + ptr[i] = (unsigned short)v[i]; + } + + template + static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& 
index) { + return _mm256_i32gather_epi32((const int*) ptr, index, scale); + } + + template + static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { + vuint8 r = zero; +#if defined(__AVX512VL__) + return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); +#else + return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); +#endif + } + + template + static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); +#else + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6]; + *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7]; +#endif + } + + template + static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) + { +#if defined(__AVX512VL__) + _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); +#else + if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; + if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1]; + if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2]; + if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3]; + if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4]; + if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5]; + if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6]; + if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7]; +#endif + } + + static __forceinline vuint8 broadcast64(const 
long long &a) { return _mm256_set1_epi64x(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Array Access + //////////////////////////////////////////////////////////////////////////////// + + __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; } + __forceinline unsigned int& operator [](size_t index) { assert(index < 8); return i[index]; } + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Unary Operators + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } +#else + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } +#endif + + __forceinline vuint8 operator +(const vuint8& a) { return a; } + + //////////////////////////////////////////////////////////////////////////////// + /// Binary Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } + __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } + + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } + __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } + + //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); } + //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } + //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } + + 
__forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } + __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } + + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } + __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } + + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } + __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } + + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 
min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } + __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } + + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } + __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } + + //////////////////////////////////////////////////////////////////////////////// + /// Assignment Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; } + __forceinline vuint8& operator +=(vuint8& a, unsigned int b) { return a = a + b; } + + __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; } + __forceinline vuint8& operator -=(vuint8& a, unsigned int b) { return a = a - b; } + + //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; } + //__forceinline vuint8& operator *=(vuint8& a, unsigned int b) { return a = a * b; } + + __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; } + __forceinline vuint8& operator &=(vuint8& a, unsigned int b) { return a = a & b; } + + __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; } + __forceinline vuint8& operator |=(vuint8& a, unsigned int b) { return a = a | b; } + + __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; } + __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } + + //////////////////////////////////////////////////////////////////////////////// + /// Comparison Operators + Select + //////////////////////////////////////////////////////////////////////////////// + +#if defined(__AVX512VL__) + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return 
_mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + } +#else + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } + //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } + //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } + //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); } + //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } + + __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + } +#endif + + template + __forceinline vuint8 select(const vuint8& t, const vuint8& f) { + return _mm256_blend_epi32(f, t, mask); + } + + __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } + __forceinline vboolf8 operator ==(unsigned int a, const vuint8& b) { return vuint8(a) == b; } + 
+ __forceinline vboolf8 operator !=(const vuint8& a, unsigned int b) { return a != vuint8(b); } + __forceinline vboolf8 operator !=(unsigned int a, const vuint8& b) { return vuint8(a) != b; } + + //__forceinline vboolf8 operator < (const vuint8& a, unsigned int b) { return a < vuint8(b); } + //__forceinline vboolf8 operator < (unsigned int a, const vuint8& b) { return vuint8(a) < b; } + + //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int b) { return a >= vuint8(b); } + //__forceinline vboolf8 operator >=(unsigned int a, const vuint8& b) { return vuint8(a) >= b; } + + //__forceinline vboolf8 operator > (const vuint8& a, unsigned int b) { return a > vuint8(b); } + //__forceinline vboolf8 operator > (unsigned int a, const vuint8& b) { return vuint8(a) > b; } + + //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int b) { return a <= vuint8(b); } + //__forceinline vboolf8 operator <=(unsigned int a, const vuint8& b) { return vuint8(a) <= b; } + + __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } + __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; } + //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a < b; } + //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; } + //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a > b; } + //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } + +#if defined(__AVX512VL__) + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const 
vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } +#else + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } + //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a < b); } + //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); } + //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a > b); } + //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); } +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Movement/Shifting/Shuffling Functions + //////////////////////////////////////////////////////////////////////////////// + + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + + template + __forceinline vuint8 shuffle(const vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + } + + template + __forceinline vuint8 shuffle4(const vuint8& v) { + return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { + return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + } + + template + __forceinline vuint8 shuffle(const 
vuint8& v) { + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template + __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + } + + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + + __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } + + template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } + template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + +#if !defined(__aarch64__) + + __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { + return _mm256_permutevar8x32_epi32(v, index); + } + + __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + } + + template + __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { +#if defined(__AVX512VL__) + return _mm256_alignr_epi32(a, b, i); +#else + return _mm256_alignr_epi8(a, b, 4*i); +#endif + } + +#endif + + + //////////////////////////////////////////////////////////////////////////////// + 
/// Reductions + //////////////////////////////////////////////////////////////////////////////// + + //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } + + //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); } + //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } + //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } + + __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } + __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } + __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } + + //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); } + //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); } + __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } + + //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); } + //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); } + + //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } + //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } + + __forceinline vuint8 assign(const vuint4& a) { return 
_mm256_castsi128_si256(a); } + + //////////////////////////////////////////////////////////////////////////////// + /// Output Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { + return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp new file mode 100644 index 00000000000..12f143f1317 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp @@ -0,0 +1,327 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "intrinsics.h" +#include "sysinfo.h" +#include "mutex.h" + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + void* alignedMalloc(size_t size, size_t align) + { + if (size == 0) + return nullptr; + + assert((align & (align-1)) == 0); + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return ptr; + } + + void alignedFree(void* ptr) + { + if (ptr) + _mm_free(ptr); + } + + static bool huge_pages_enabled = false; + static MutexSys os_init_mutex; + + __forceinline bool isHugePageCandidate(const size_t bytes) + { + if (!huge_pages_enabled) + return false; + + /* use huge pages only when memory overhead is low */ + const size_t hbytes = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1); + return 66*(hbytes-bytes) < bytes; // at most 1.5% overhead + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform 
+//////////////////////////////////////////////////////////////////////////////// + +#ifdef _WIN32 + +#define WIN32_LEAN_AND_MEAN +#include +#include + +namespace embree +{ + bool win_enable_selockmemoryprivilege (bool verbose) + { + HANDLE hToken; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) { + if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + TOKEN_PRIVILEGES tp; + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) { + if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl; + return false; + } + + SetLastError(ERROR_SUCCESS); + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl; + return false; + } + + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { + if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." 
<< std::endl; + return false; + } + + return true; + } + + bool os_init(bool hugepages, bool verbose) + { + Lock lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + + if (GetLargePageMinimum() != PAGE_SIZE_2M) { + huge_pages_enabled = false; + return false; + } + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { + int flags = MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + if (ptr != nullptr) { + hugepages = true; + return ptr; + } + } + + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); + // -- GODOT start -- + // if (ptr == nullptr) throw std::bad_alloc(); + if (ptr == nullptr) abort(); + // -- GODOT end -- + hugepages = false; + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + if (hugepages) // decommitting huge pages seems not to work under Windows + return bytesOld; + + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) + { + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include +#include +#include +#include +#include + +#if defined(__MACOSX__) +#include +#endif + +namespace embree +{ + bool os_init(bool hugepages, bool verbose) + { + Lock lock(os_init_mutex); + + if (!hugepages) { + huge_pages_enabled = false; + return true; + } + +#if defined(__LINUX__) + + int hugepagesize = 0; + + std::ifstream file; + file.open("/proc/meminfo",std::ios::in); + if (!file.is_open()) { + if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" 
<< std::endl; + huge_pages_enabled = false; + return false; + } + + std::string line; + while (getline(file,line)) + { + std::stringstream sline(line); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string tag; getline(sline,tag,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string val; getline(sline,val,' '); + while (!sline.eof() && sline.peek() == ' ') sline.ignore(); + std::string unit; getline(sline,unit,' '); + if (tag == "Hugepagesize:" && unit == "kB") { + hugepagesize = std::stoi(val)*1024; + break; + } + } + + if (hugepagesize != PAGE_SIZE_2M) + { + if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl; + huge_pages_enabled = false; + return false; + } +#endif + + huge_pages_enabled = true; + return true; + } + + void* os_malloc(size_t bytes, bool& hugepages) + { + if (bytes == 0) { + hugepages = false; + return nullptr; + } + + /* try direct huge page allocation first */ + if (isHugePageCandidate(bytes)) + { +#if defined(__MACOSX__) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0); + if (ptr != MAP_FAILED) { + hugepages = true; + return ptr; + } +#endif + } + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + // -- GODOT start -- + // if (ptr == MAP_FAILED) throw std::bad_alloc(); + if (ptr == MAP_FAILED) abort(); + // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ + os_advise(ptr,bytes); + return ptr; + } + + size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages) + { + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytesNew = (bytesNew+pageSize-1) & ~(pageSize-1); + bytesOld = (bytesOld+pageSize-1) & ~(pageSize-1); + if (bytesNew >= bytesOld) + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + + return bytesNew; + } + + void os_free(void* ptr, size_t bytes, bool hugepages) + { + if (bytes == 0) + return; + + /* for hugepages we need to also align the size */ + const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) + // -- GODOT start -- + // throw std::bad_alloc(); + abort(); + // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ + void os_advise(void* pptr, size_t bytes) + { +#if defined(MADV_HUGEPAGE) + madvise(pptr,bytes,MADV_HUGEPAGE); +#endif + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h new file mode 100644 index 00000000000..5898ecda709 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/alloc.h @@ -0,0 +1,164 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include +#include + +namespace embree +{ +#define ALIGNED_STRUCT_(align) \ + void* operator new(size_t size) { return alignedMalloc(size,align); } \ + void operator delete(void* ptr) { alignedFree(ptr); } \ + void* operator new[](size_t size) { return alignedMalloc(size,align); } \ + void operator delete[](void* ptr) { alignedFree(ptr); } + +#define ALIGNED_CLASS_(align) \ + public: \ + ALIGNED_STRUCT_(align) \ + private: + + /*! aligned allocation */ + void* alignedMalloc(size_t size, size_t align); + void alignedFree(void* ptr); + + /*! 
allocator that performs aligned allocations */ + template + struct aligned_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline pointer allocate( size_type n ) { + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return alignedFree(p); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + }; + + /*! allocates pages directly from OS */ + bool win_enable_selockmemoryprivilege(bool verbose); + bool os_init(bool hugepages, bool verbose); + void* os_malloc (size_t bytes, bool& hugepages); + size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages); + void os_free (void* ptr, size_t bytes, bool hugepages); + void os_advise (void* ptr, size_t bytes); + + /*! allocator that performs OS allocations */ + template + struct os_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline os_allocator () + : hugepages(false) {} + + __forceinline pointer allocate( size_type n ) { + return (pointer) os_malloc(n*sizeof(value_type),hugepages); + } + + __forceinline void deallocate( pointer p, size_type n ) { + return os_free(p,n*sizeof(value_type),hugepages); + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + bool hugepages; + }; + + /*! 
allocator for IDs */ + template + struct IDPool + { + typedef T value_type; + + IDPool () + : nextID(0) {} + + T allocate() + { + /* return ID from list */ + if (!IDs.empty()) + { + T id = *IDs.begin(); + IDs.erase(IDs.begin()); + return id; + } + + /* allocate new ID */ + else + { + if (size_t(nextID)+1 > max_id) + return -1; + + return nextID++; + } + } + + /* adds an ID provided by the user */ + bool add(T id) + { + if (id > max_id) + return false; + + /* check if ID should be in IDs set */ + if (id < nextID) { + auto p = IDs.find(id); + if (p == IDs.end()) return false; + IDs.erase(p); + return true; + } + + /* otherwise increase ID set */ + else + { + for (T i=nextID; i IDs; //!< stores deallocated IDs to be reused + T nextID; //!< next ID to use when IDs vector is empty + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h new file mode 100644 index 00000000000..77722a39f67 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/array.h @@ -0,0 +1,222 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "alloc.h" + +namespace embree +{ + /*! 
static array with static size */ + template + class array_t + { + public: + + /********************** Iterators ****************************/ + + __forceinline T* begin() const { return items; }; + __forceinline T* end () const { return items+N; }; + + + /********************** Capacity ****************************/ + + __forceinline bool empty () const { return N == 0; } + __forceinline size_t size () const { return N; } + __forceinline size_t max_size () const { return N; } + + + /******************** Element access **************************/ + + __forceinline T& operator[](size_t i) { assert(i < N); return items[i]; } + __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& at(size_t i) { assert(i < N); return items[i]; } + __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; } + + __forceinline T& front() const { assert(N > 0); return items[0]; }; + __forceinline T& back () const { assert(N > 0); return items[N-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + T items[N]; + }; + + /*! static array with dynamic size */ + template + class darray_t + { + public: + + __forceinline darray_t () : M(0) {} + + __forceinline darray_t (const T& v) : M(0) { + for (size_t i=0; i 0); return items[0]; }; + __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + private: + size_t M; + T items[N]; + }; + + /*! 
dynamic sized array that is allocated on the stack */ +#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray Name(N) + template + struct __aligned(64) StackArray + { + __forceinline StackArray (const size_t N) + : N(N) + { + if (N*sizeof(Ty) <= max_stack_bytes) + data = &arr[0]; + else + data = (Ty*) alignedMalloc(N*sizeof(Ty),64); + } + + __forceinline ~StackArray () { + if (data != &arr[0]) alignedFree(data); + } + + __forceinline operator Ty* () { return data; } + __forceinline operator const Ty* () const { return data; } + + __forceinline Ty& operator[](const int i) { assert(i>=0 && i=0 && i + struct __aligned(64) DynamicStackArray + { + __forceinline DynamicStackArray () + : data(&arr[0]) {} + + __forceinline ~DynamicStackArray () + { + if (!isStackAllocated()) + delete[] data; + } + + __forceinline bool isStackAllocated() const { + return data == &arr[0]; + } + + __forceinline size_t size() const + { + if (isStackAllocated()) return max_stack_elements; + else return max_total_elements; + } + + __forceinline void resize(size_t M) + { + assert(M <= max_total_elements); + if (likely(M <= max_stack_elements)) return; + if (likely(!isStackAllocated())) return; + + data = new Ty[max_total_elements]; + + for (size_t i=0; i=0 && ioperator[] (i) = other[i]; + } + + DynamicStackArray& operator= (const DynamicStackArray& other) + { + for (size_t i=0; ioperator[] (i) = other[i]; + + return *this; + } + + private: + Ty arr[max_stack_elements]; + Ty* data; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h new file mode 100644 index 00000000000..ebfb8552c35 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/atomic.h @@ -0,0 +1,59 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "intrinsics.h" + +namespace embree +{ +/* compiler memory barriers */ +#if defined(__INTEL_COMPILER) +//#define __memory_barrier() 
__memory_barrier() +#elif defined(__GNUC__) || defined(__clang__) +# define __memory_barrier() asm volatile("" ::: "memory") +#elif defined(_MSC_VER) +# define __memory_barrier() _ReadWriteBarrier() +#endif + + template + struct atomic : public std::atomic + { + atomic () {} + + atomic (const T& a) + : std::atomic(a) {} + + atomic (const atomic& a) { + this->store(a.load()); + } + + atomic& operator=(const atomic& other) { + this->store(other.load()); + return *this; + } + }; + + template + __forceinline void atomic_min(std::atomic& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a <= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } + + template + __forceinline void atomic_max(std::atomic& aref, const T& bref) + { + const T b = bref.load(); + while (true) { + T a = aref.load(); + if (a >= b) break; + if (aref.compare_exchange_strong(a,b)) break; + } + } +} diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp new file mode 100644 index 00000000000..0061d18db25 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/barrier.cpp @@ -0,0 +1,289 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.h" +#include "condition.h" +#include "regression.h" +#include "thread.h" + +#if defined (__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : i(0), enterCount(0), exitCount(0), barrierSize(0) + { + events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr); + init(N); + } + + __forceinline ~BarrierSysImplementation () + { + CloseHandle(events[0]); + CloseHandle(events[1]); + } + + __forceinline void init(size_t N) + { + barrierSize = N; + enterCount.store(N); + exitCount.store(N); + } + + __forceinline void wait() + { + /* every 
thread entering the barrier decrements this count */ + size_t i0 = i; + size_t cnt0 = enterCount--; + + /* all threads except the last one are wait in the barrier */ + if (cnt0 > 1) + { + if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0) + THROW_RUNTIME_ERROR("WaitForSingleObjects failed"); + } + + /* the last thread starts all threads waiting at the barrier */ + else + { + i = 1-i; + enterCount.store(barrierSize); + if (SetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("SetEvent failed"); + } + + /* every thread leaving the barrier decrements this count */ + size_t cnt1 = exitCount--; + + /* the last thread that left the barrier resets the event again */ + if (cnt1 == 1) + { + exitCount.store(barrierSize); + if (ResetEvent(events[i0]) == 0) + THROW_RUNTIME_ERROR("ResetEvent failed"); + } + } + + public: + HANDLE events[2]; + atomic i; + atomic enterCount; + atomic exitCount; + size_t barrierSize; + }; +} + +#else + +namespace embree +{ + struct BarrierSysImplementation + { + __forceinline BarrierSysImplementation (size_t N) + : count(0), barrierSize(0) + { + init(N); + } + + __forceinline void init(size_t N) + { + assert(count == 0); + count = 0; + barrierSize = N; + } + + __forceinline void wait() + { + mutex.lock(); + count++; + + if (count == barrierSize) { + count = 0; + cond.notify_all(); + mutex.unlock(); + return; + } + + cond.wait(mutex); + mutex.unlock(); + return; + } + + public: + MutexSys mutex; + ConditionSys cond; + volatile size_t count; + volatile size_t barrierSize; + }; +} + +#endif + +namespace embree +{ + BarrierSys::BarrierSys (size_t N) { + opaque = new BarrierSysImplementation(N); + } + + BarrierSys::~BarrierSys () { + delete (BarrierSysImplementation*) opaque; + } + + void BarrierSys::init(size_t count) { + ((BarrierSysImplementation*) opaque)->init(count); + } + + void BarrierSys::wait() { + ((BarrierSysImplementation*) opaque)->wait(); + } + + LinearBarrierActive::LinearBarrierActive (size_t N) + : count0(nullptr), 
count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0) + { + if (N == 0) N = getNumberOfLogicalThreads(); + init(N); + } + + LinearBarrierActive::~LinearBarrierActive() + { + delete[] count0; + delete[] count1; + } + + void LinearBarrierActive::init(size_t N) + { + if (threadCount != N) { + threadCount = N; + if (count0) delete[] count0; count0 = new unsigned char[N]; + if (count1) delete[] count1; count1 = new unsigned char[N]; + } + mode = 0; + flag0 = 0; + flag1 = 0; + for (size_t i=0; i threadID; + std::atomic numFailed; + std::vector threadResults; + + barrier_sys_regression_test() + : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(barrier_sys_regression_test* This) + { + size_t tid = This->threadID++; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + This->threadResults[tid] = tid; + This->barrier.wait(); + } + } + + bool run () + { + threadID.store(0); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + threadResults.resize(numThreads); + barrier.init(numThreads+1); + + /* create threads */ + std::vector threads; + for (size_t i=0; i cntr; + }; + + /*! fast active barrier that does not require initialization to some number of threads */ + struct BarrierActiveAutoReset + { + public: + BarrierActiveAutoReset () + : cntr0(0), cntr1(0) {} + + void wait (size_t threadCount) + { + cntr0.fetch_add(1); + while (cntr0 != threadCount) pause_cpu(); + cntr1.fetch_add(1); + while (cntr1 != threadCount) pause_cpu(); + cntr0.fetch_add(-1); + while (cntr0 != 0) pause_cpu(); + cntr1.fetch_add(-1); + while (cntr1 != 0) pause_cpu(); + } + + private: + std::atomic cntr0; + std::atomic cntr1; + }; + + class LinearBarrierActive + { + public: + + /*! construction and destruction */ + LinearBarrierActive (size_t threadCount = 0); + ~LinearBarrierActive(); + + private: + /*! 
class in non-copyable */ + LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement + LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement + + public: + /*! intializes the barrier with some number of threads */ + void init(size_t threadCount); + + /*! thread with threadIndex waits in the barrier */ + void wait (const size_t threadIndex); + + private: + volatile unsigned char* count0; + volatile unsigned char* count1; + volatile unsigned int mode; + volatile unsigned int flag0; + volatile unsigned int flag1; + volatile size_t threadCount; + }; +} + diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp new file mode 100644 index 00000000000..0e7ca7af399 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.cpp @@ -0,0 +1,81 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "condition.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + InitializeConditionVariable(&cond); + } + + __forceinline ~ConditionImplementation () { + } + + __forceinline void wait(MutexSys& mutex_in) { + SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE); + } + + __forceinline void notify_all() { + WakeAllConditionVariable(&cond); + } + + public: + CONDITION_VARIABLE cond; + }; +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include +namespace embree +{ + struct ConditionImplementation + { + __forceinline ConditionImplementation () { + pthread_cond_init(&cond,nullptr); + } + + __forceinline ~ConditionImplementation() { + pthread_cond_destroy(&cond); + } + + __forceinline void wait(MutexSys& mutex) { + pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); + } + + __forceinline void notify_all() { + 
pthread_cond_broadcast(&cond); + } + + public: + pthread_cond_t cond; + }; +} +#endif + +namespace embree +{ + ConditionSys::ConditionSys () { + cond = new ConditionImplementation; + } + + ConditionSys::~ConditionSys() { + delete (ConditionImplementation*) cond; + } + + void ConditionSys::wait(MutexSys& mutex) { + ((ConditionImplementation*) cond)->wait(mutex); + } + + void ConditionSys::notify_all() { + ((ConditionImplementation*) cond)->notify_all(); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h new file mode 100644 index 00000000000..7a3a05aa81a --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/condition.h @@ -0,0 +1,31 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "mutex.h" + +namespace embree +{ + class ConditionSys + { + public: + ConditionSys(); + ~ConditionSys(); + void wait( class MutexSys& mutex ); + void notify_all(); + + template + __forceinline void wait( class MutexSys& mutex, const Predicate& pred ) + { + while (!pred()) wait(mutex); + } + + private: + ConditionSys (const ConditionSys& other) DELETED; // do not implement + ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement + + protected: + void* cond; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp new file mode 100644 index 00000000000..86182c1afbb --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/filename.cpp @@ -0,0 +1,138 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "filename.h" +#include "sysinfo.h" + +namespace embree +{ +#ifdef __WIN32__ + const char path_sep = '\\'; +#else + const char path_sep = '/'; +#endif + + /*! create an empty filename */ + FileName::FileName () {} + + /*! 
create a valid filename from a string */ + FileName::FileName (const char* in) { + filename = in; + for (size_t i=0; i +#endif + +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#if defined(NEON_AVX2_EMULATION) +#include "../math/AVX2NEON.h" +#endif +#else +#include +#endif + +#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) + #if !defined(_tzcnt_u32) + #define _tzcnt_u32 __tzcnt_u32 + #endif + #if !defined(_tzcnt_u64) + #define _tzcnt_u64 __tzcnt_u64 + #endif +#endif + +#if defined(__aarch64__) +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clz +#endif +#if !defined(_lzcnt_u32) + #define _lzcnt_u32 __builtin_clzll +#endif +#else +#if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif +#endif +#endif + +#if defined(__WIN32__) +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +/* normally defined in pmmintrin.h, but we always need this */ +#if !defined(_MM_SET_DENORMALS_ZERO_MODE) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) +#define _MM_DENORMALS_ZERO_MASK (0x0040) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +#endif + +namespace embree +{ + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + + __forceinline size_t read_tsc() + { + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + return (size_t)li.QuadPart; + } + + __forceinline int bsf(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; _BitScanForward(&r,v); return r; +#endif + } + + __forceinline unsigned bsf(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return _tzcnt_u32(v); +#else + unsigned long r = 0; 
_BitScanForward(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) + return _tzcnt_u64(v); +#else + unsigned long r = 0; _BitScanForward64(&r,v); return r; +#endif + } +#endif + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + + __forceinline unsigned bscf(unsigned& v) + { + unsigned i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#else + unsigned long r = 0; _BitScanReverse(&r,v); return r; +#endif + } + +#if defined(__X86_64__) + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) + return 63 -_lzcnt_u64(v); +#else + unsigned long r = 0; _BitScanReverse64(&r, v); return r; +#endif + } +#endif + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline int btc(int v, int i) { + long r = v; _bittestandcomplement(&r,i); return r; + } + + __forceinline int bts(int v, int i) { + long r = v; _bittestandset(&r,i); return r; + } + + __forceinline int btr(int v, int i) { + long r = v; _bittestandreset(&r,i); return r; + } + +#if defined(__X86_64__) + + __forceinline size_t btc(size_t v, size_t i) { + size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r; + } + + __forceinline size_t bts(size_t v, size_t i) { + __int64 r = v; _bittestandset64(&r,i); return r; + } + + __forceinline size_t btr(size_t v, size_t i) { + __int64 r = v; _bittestandreset64(&r,i); return r; + } + +#endif + + 
__forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) { + return _InterlockedCompareExchange((volatile long*)p,v,c); + } + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#else + +#if defined(__i386__) && defined(__PIC__) + + __forceinline void __cpuid(int out[4], int op) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "0"(op)); + } + + __forceinline void __cpuid_count(int out[4], int op1, int op2) + { + asm volatile ("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3]) + : "0" (op1), "2" (op2)); + } + +#else + + __forceinline void __cpuid(int out[4], int op) { +#if defined(__ARM_NEON) + if (op == 0) { // Get CPU name + out[0] = 0x41524d20; + out[1] = 0x41524d20; + out[2] = 0x41524d20; + out[3] = 0x41524d20; + } +#else + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#endif + } + +#if !defined(__ARM_NEON) + __forceinline void __cpuid_count(int out[4], int op1, int op2) { + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); + } +#endif + +#endif + + __forceinline uint64_t read_tsc() { +#if defined(__ARM_NEON) + return 0; // FIXME(LTE): mimic rdtsc +#else + uint32_t high,low; + asm volatile ("rdtsc" : "=d"(high), "=a"(low)); + return (((uint64_t)high) << 32) + (uint64_t)low; +#endif + } + + __forceinline int bsf(int v) { +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsf(unsigned v) + { +#if 
defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) + return _tzcnt_u32(v); +#else + unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif +#endif + } +#endif + + __forceinline size_t bsf(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return _tzcnt_u64(v); +#else + return _tzcnt_u32(v); +#endif +#elif defined(__ARM_NEON) + return __builtin_ctzl(v); +#else + size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int bscf(int& v) + { + int i = bsf(v); + v &= v-1; + return i; + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned int bscf(unsigned int& v) + { + unsigned int i = bsf(v); + v &= v-1; + return i; + } +#endif + + __forceinline size_t bscf(size_t& v) + { + size_t i = bsf(v); + v &= v-1; + return i; + } + + __forceinline int bsr(int v) { +#if defined(__AVX2__) && !defined(__aarch64__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + +#if defined(__X86_64__) || defined(__aarch64__) + __forceinline unsigned bsr(unsigned v) { +#if defined(__AVX2__) + return 31 - _lzcnt_u32(v); +#elif defined(__ARM_NEON) + return __builtin_clz(v)^31; +#else + unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } +#endif + + __forceinline size_t bsr(size_t v) { +#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__X86_64__) + return 63 - _lzcnt_u64(v); +#else + return 31 - _lzcnt_u32(v); +#endif +#elif defined(__aarch64__) + return (sizeof(v) * 8 - 1) - __builtin_clzl(v); +#else + size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; +#endif + } + + __forceinline int lzcnt(const int x) + { +#if defined(__AVX2__) && !defined(__aarch64__) + return _lzcnt_u32(x); +#else + if (unlikely(x == 0)) return 32; + return 31 - bsr(x); +#endif + } + + __forceinline size_t blsr(size_t v) { +#if 
defined(__AVX2__) && !defined(__aarch64__) +#if defined(__INTEL_COMPILER) + return _blsr_u64(v); +#else +#if defined(__X86_64__) + return __blsr_u64(v); +#else + return __blsr_u32(v); +#endif +#endif +#else + return v & (v-1); +#endif + } + + __forceinline int btc(int v, int i) { +#if defined(__aarch64__) + // _bittestandcomplement(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a ^ (1 << b); + // return x; + + // We only need `*a` + return (v ^ (1 << i)); +#else + int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline int bts(int v, int i) { +#if defined(__aarch64__) + // _bittestandset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a | (1 << b); + // return x; + return (v | (v << i)); +#else + int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int btr(int v, int i) { +#if defined(__aarch64__) + // _bittestandreset(long *a, long b) { + // unsigned char x = (*a >> b) & 1; + // *a = *a & ~(1 << b); + // return x; + return (v & ~(v << i)); +#else + int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btc(size_t v, size_t i) { +#if defined(__aarch64__) + return (v ^ (1 << i)); +#else + size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r; +#endif + } + + __forceinline size_t bts(size_t v, size_t i) { +#if defined(__aarch64__) + return (v | (v << i)); +#else + size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline size_t btr(size_t v, size_t i) { +#if defined(__ARM_NEON) + return (v & ~(v << i)); +#else + size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; +#endif + } + + __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) { + return __sync_val_compare_and_swap(value, comparand, input); + } + +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(_mm_undefined_ps) + __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); } +#endif +#if !defined(_mm_undefined_si128) + __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); } +#endif +#if !defined(_mm256_undefined_ps) && defined(__AVX__) + __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); } +#endif +#if !defined(_mm256_undefined_si256) && defined(__AVX__) + __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); } +#endif +#if !defined(_mm512_undefined_ps) && defined(__AVX512F__) + __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); } +#endif +#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__) + __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); } +#endif +#endif + +#if defined(__SSE4_2__) || defined(__ARM_NEON) + + __forceinline int popcnt(int in) { + return _mm_popcnt_u32(in); + } + + __forceinline unsigned popcnt(unsigned in) { + return _mm_popcnt_u32(in); + } + +#if defined(__X86_64__) || defined(__ARM_NEON) + __forceinline size_t popcnt(size_t in) { + return _mm_popcnt_u64(in); + } +#endif + +#endif + + __forceinline uint64_t rdtsc() + { + int dummy[4]; + __cpuid(dummy,0); + uint64_t clock = read_tsc(); + __cpuid(dummy,0); + return clock; + } + + __forceinline void pause_cpu(const size_t N = 8) + { + for (size_t i=0; i + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { + std::string fullName = file+".dll"; + FileName executable = getExecutableFileName(); + HANDLE handle = LoadLibrary((executable.path() + fullName).c_str()); + return lib_t(handle); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t 
lib, const std::string& sym) { + return reinterpret_cast(GetProcAddress(HMODULE(lib),sym.c_str())); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + FreeLibrary(HMODULE(lib)); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include + +namespace embree +{ + /* opens a shared library */ + lib_t openLibrary(const std::string& file) + { +#if defined(__MACOSX__) + std::string fullName = "lib"+file+".dylib"; +#else + std::string fullName = "lib"+file+".so"; +#endif + void* lib = dlopen(fullName.c_str(), RTLD_NOW); + if (lib) return lib_t(lib); + FileName executable = getExecutableFileName(); + lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW); + if (lib == nullptr) { + const char* error = dlerror(); + if (error) { + THROW_RUNTIME_ERROR(error); + } else { + THROW_RUNTIME_ERROR("could not load library "+executable.str()); + } + } + return lib_t(lib); + } + + /* returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym) { + return dlsym(lib,sym.c_str()); + } + + /* closes the shared library */ + void closeLibrary(lib_t lib) { + dlclose(lib); + } +} +#endif diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h new file mode 100644 index 00000000000..c2164e9fbe0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/library.h @@ -0,0 +1,21 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +namespace embree +{ + /*! type for shared library */ + typedef struct opaque_lib_t* lib_t; + + /*! loads a shared library */ + lib_t openLibrary(const std::string& file); + + /*! returns address of a symbol from the library */ + void* getSymbol(lib_t lib, const std::string& sym); + + /*! 
unloads a shared library */ + void closeLibrary(lib_t lib); +} diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp new file mode 100644 index 00000000000..11779bc9b9e --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.cpp @@ -0,0 +1,58 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "mutex.h" +#include "regression.h" + +#if defined(__WIN32__) && !defined(PTHREADS_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); } + MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; } + void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); } + bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; } + void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); } +} +#endif + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) +#include +namespace embree +{ + /*! 
system mutex using pthreads */ + MutexSys::MutexSys() + { + mutex = new pthread_mutex_t; + if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_init failed"); + } + + MutexSys::~MutexSys() + { + MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; + assert(ok); + delete (pthread_mutex_t*)mutex; + mutex = nullptr; + } + + void MutexSys::lock() + { + if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_lock failed"); + } + + bool MutexSys::try_lock() { + return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0; + } + + void MutexSys::unlock() + { + if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0) + THROW_RUNTIME_ERROR("pthread_mutex_unlock failed"); + } +}; +#endif diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h new file mode 100644 index 00000000000..1164210f23a --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/mutex.h @@ -0,0 +1,98 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "intrinsics.h" +#include "atomic.h" + +namespace embree +{ + /*! system mutex */ + class MutexSys { + friend struct ConditionImplementation; + public: + MutexSys(); + ~MutexSys(); + + private: + MutexSys (const MutexSys& other) DELETED; // do not implement + MutexSys& operator= (const MutexSys& other) DELETED; // do not implement + + public: + void lock(); + bool try_lock(); + void unlock(); + + protected: + void* mutex; + }; + + /*! 
spinning mutex */ + class SpinLock + { + public: + + SpinLock () + : flag(false) {} + + __forceinline bool isLocked() { + return flag.load(); + } + + __forceinline void lock() + { + while (true) + { + while (flag.load()) + { + _mm_pause(); + _mm_pause(); + } + + bool expected = false; + if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire)) + break; + } + } + + __forceinline bool try_lock() + { + bool expected = false; + if (flag.load() != expected) { + return false; + } + return flag.compare_exchange_strong(expected,true,std::memory_order_acquire); + } + + __forceinline void unlock() { + flag.store(false,std::memory_order_release); + } + + __forceinline void wait_until_unlocked() + { + while(flag.load()) + { + _mm_pause(); + _mm_pause(); + } + } + + public: + atomic flag; + }; + + /*! safe mutex lock and unlock helper */ + template class Lock { + public: + Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); } + Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {} + ~Lock() { if (locked) mutex.unlock(); } + __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); } + __forceinline bool isLocked() const { return locked; } + protected: + Mutex& mutex; + bool locked; + }; +} diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h new file mode 100644 index 00000000000..737f14aa6e3 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/platform.h @@ -0,0 +1,387 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +/// detect platform +//////////////////////////////////////////////////////////////////////////////// + +/* detect 32 or 64 platform */ +#if 
defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +#define __X86_64__ +#endif + +/* detect Linux platform */ +#if defined(linux) || defined(__linux__) || defined(__LINUX__) +# if !defined(__LINUX__) +# define __LINUX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect FreeBSD platform */ +#if defined(__FreeBSD__) || defined(__FREEBSD__) +# if !defined(__FREEBSD__) +# define __FREEBSD__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */ +#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__) +# if !defined(__WIN32__) +# define __WIN32__ +# endif +#endif + +/* detect Cygwin platform */ +#if defined(__CYGWIN__) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* detect MAC OS X platform */ +#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__) +# if !defined(__MACOSX__) +# define __MACOSX__ +# endif +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +/* try to detect other Unix systems */ +#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix) +# if !defined(__UNIX__) +# define __UNIX__ +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Macros +//////////////////////////////////////////////////////////////////////////////// + +#ifdef __WIN32__ +#define dll_export __declspec(dllexport) +#define dll_import __declspec(dllimport) +#else +#define dll_export __attribute__ ((visibility ("default"))) +#define dll_import +#endif + +#ifdef __WIN32__ +#if !defined(__noinline) +#define __noinline __declspec(noinline) +#endif +//#define __forceinline __forceinline +//#define __restrict __restrict +#if defined(__INTEL_COMPILER) +#define __restrict__ __restrict +#else +#define __restrict__ //__restrict // causes issues with MSVC +#endif +#if !defined(__thread) +// NOTE: Require `-fms-extensions` 
for clang +#define __thread __declspec(thread) +#endif +#if !defined(__aligned) +#if defined(__MINGW32__) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#else +#define __aligned(...) __declspec(align(__VA_ARGS__)) +#endif +#endif +//#define __FUNCTION__ __FUNCTION__ +#define debugbreak() __debugbreak() + +#else +#if !defined(__noinline) +#define __noinline __attribute__((noinline)) +#endif +#if !defined(__forceinline) +#define __forceinline inline __attribute__((always_inline)) +#endif +//#define __restrict __restrict +//#define __thread __thread +#if !defined(__aligned) +#define __aligned(...) __attribute__((aligned(__VA_ARGS__))) +#endif +#if !defined(__FUNCTION__) +#define __FUNCTION__ __PRETTY_FUNCTION__ +#endif +#define debugbreak() asm ("int $3") +#endif + +#if defined(__clang__) || defined(__GNUC__) + #define MAYBE_UNUSED __attribute__((unused)) +#else + #define MAYBE_UNUSED +#endif + +#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly + #define DELETED +#else + #define DELETED = delete +#endif + +// -- GODOT start -- +#ifndef likely +// -- GODOT end -- +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) __builtin_expect((bool)(expr),true ) +#define unlikely(expr) __builtin_expect((bool)(expr),false) +#endif +// -- GODOT start -- +#endif +// -- GODOT end -- + +//////////////////////////////////////////////////////////////////////////////// +/// Error handling and debugging +//////////////////////////////////////////////////////////////////////////////// + +/* debug printing macros */ +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl +#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl +#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) 
<< embree_endl +#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl +#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT start -- + // #define THROW_RUNTIME_ERROR(str) + // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ + abort(); + // -- GODOT end -- +#endif + +#define FATAL(x) THROW_RUNTIME_ERROR(x) +#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; } + +#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") + +//////////////////////////////////////////////////////////////////////////////// +/// Basic types +//////////////////////////////////////////////////////////////////////////////// + +/* default floating-point type */ +namespace embree { + typedef float real; +} + +/* windows does not have ssize_t */ +#if defined(__WIN32__) +#if defined(__X86_64__) || defined(__aarch64__) +typedef int64_t ssize_t; +#else +typedef int32_t ssize_t; +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Basic utility functions +//////////////////////////////////////////////////////////////////////////////// + +__forceinline std::string toString(long long value) { + return std::to_string(value); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Disable some compiler warnings 
+//////////////////////////////////////////////////////////////////////////////// + +#if defined(__INTEL_COMPILER) +//#pragma warning(disable:265 ) // floating-point operation result is out of range +//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used +//#pragma warning(disable:869 ) // parameter was never referenced +//#pragma warning(disable:981 ) // operands are evaluated in unspecified order +//#pragma warning(disable:1418) // external function definition with no prior declaration +//#pragma warning(disable:1419) // external declaration in primary source file +//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable +//#pragma warning(disable:94 ) // the size of an array must be greater than zero +//#pragma warning(disable:1599) // declaration hides parameter +//#pragma warning(disable:424 ) // extra ";" ignored +#pragma warning(disable:2196) // routine is both "inline" and "noinline" +//#pragma warning(disable:177 ) // label was declared but never referenced +//#pragma warning(disable:114 ) // function was referenced but not defined +//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function +#pragma warning(disable:15335) // was not vectorized: vectorization possible but seems inefficient +#endif + +#if defined(_MSC_VER) +//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union +#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) +//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data +#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +//#pragma warning(disable:4355) // 'this' : used in base member initializer list +//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch +//#pragma warning(disable:4018) // '<' : signed / 
unsigned mismatch +//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float' +//#pragma warning(disable:4068) // unknown pragma +//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned +//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion) +//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored +#pragma warning(disable:4503) // decorated name length exceeded, name was truncated +#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored +#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used + +# if _MSC_VER < 1910 // prior to Visual studio 2017 (V141) +# pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings +# pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0 +# endif + +#endif + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +//#pragma clang diagnostic ignored "-Wunknown-pragmas" +//#pragma clang diagnostic ignored "-Wunused-variable" +//#pragma clang diagnostic ignored "-Wreorder" +//#pragma clang diagnostic ignored "-Wmicrosoft" +//#pragma clang diagnostic ignored "-Wunused-private-field" +//#pragma clang diagnostic ignored "-Wunused-local-typedef" +//#pragma clang diagnostic ignored "-Wunused-function" +//#pragma clang diagnostic ignored "-Wnarrowing" +//#pragma clang diagnostic ignored "-Wc++11-narrowing" +//#pragma clang diagnostic ignored "-Wdeprecated-register" +//#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +#pragma GCC diagnostic ignored "-Wpragmas" +//#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" +//#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +//#pragma GCC diagnostic ignored "-Warray-bounds" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wparentheses" +#endif + +#if defined(__clang__) && defined(__WIN32__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmicrosoft-cast" +#pragma clang diagnostic ignored "-Wmicrosoft-enum-value" +#pragma clang diagnostic ignored "-Wmicrosoft-include" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunknown-pragmas" +#endif + +/* disabling deprecated warning, please use only where use of deprecated Embree API functions is desired */ +#if defined(__WIN32__) && defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable: 1478)) // warning: function was declared deprecated +#elif defined(__INTEL_COMPILER) +#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("warning (enable : 1478)") // warning: function was declared deprecated +#elif defined(__clang__) +#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(__GNUC__) +#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#define ENABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated +#elif defined(_MSC_VER) +#define 
DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated +#define ENABLE_DEPRECATED_WARNING __pragma(warning (enable : 4996)) // warning: function was declared deprecated +#endif + +/* embree output stream */ +#define embree_ostream std::ostream& +#define embree_cout std::cout +#define embree_cout_uniform std::cout +#define embree_endl std::endl + +//////////////////////////////////////////////////////////////////////////////// +/// Some macros for static profiling +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__GNUC__) +#define IACA_SSC_MARK( MARK_ID ) \ +__asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "memory" ); + +#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B"); + +#else +#define IACA_UD_BYTES {__asm _emit 0x0F \ + __asm _emit 0x0B} + +#define IACA_SSC_MARK(x) {__asm mov ebx, x\ + __asm _emit 0x64 \ + __asm _emit 0x67 \ + __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); + +#endif + +#define IACA_START {IACA_UD_BYTES \ + IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) \ + IACA_UD_BYTES} + +namespace embree +{ + template + struct OnScopeExitHelper + { + OnScopeExitHelper (const Closure f) : active(true), f(f) {} + ~OnScopeExitHelper() { if (active) f(); } + void deactivate() { active = false; } + bool active; + const Closure f; + }; + + template + OnScopeExitHelper OnScopeExit(const Closure f) { + return OnScopeExitHelper(f); + } + +#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) +#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define ON_SCOPE_EXIT(code) \ + auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) + + template + std::unique_ptr make_unique(Ty* ptr) { + return std::unique_ptr(ptr); + } + +} diff --git a/thirdparty/embree-aarch64/common/sys/ref.h 
b/thirdparty/embree-aarch64/common/sys/ref.h new file mode 100644 index 00000000000..24648e62344 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/ref.h @@ -0,0 +1,122 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "atomic.h" + +namespace embree +{ + struct NullTy { + }; + + extern MAYBE_UNUSED NullTy null; + + class RefCount + { + public: + RefCount(int val = 0) : refCounter(val) {} + virtual ~RefCount() {}; + + virtual RefCount* refInc() { refCounter.fetch_add(1); return this; } + virtual void refDec() { if (refCounter.fetch_add(-1) == 1) delete this; } + private: + std::atomic refCounter; + }; + + //////////////////////////////////////////////////////////////////////////////// + /// Reference to single object + //////////////////////////////////////////////////////////////////////////////// + + template + class Ref + { + public: + Type* ptr; + + //////////////////////////////////////////////////////////////////////////////// + /// Constructors, Assignment & Cast Operators + //////////////////////////////////////////////////////////////////////////////// + + __forceinline Ref() : ptr(nullptr) {} + __forceinline Ref(NullTy) : ptr(nullptr) {} + __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } + __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } + + __forceinline Ref(Type* const input) : ptr(input) + { + if (ptr) + ptr->refInc(); + } + + __forceinline ~Ref() + { + if (ptr) + ptr->refDec(); + } + + __forceinline Ref& operator =(const Ref& input) + { + if (input.ptr) + input.ptr->refInc(); + if (ptr) + ptr->refDec(); + ptr = input.ptr; + return *this; + } + + __forceinline Ref& operator =(Ref&& input) + { + if (ptr) + ptr->refDec(); + ptr = input.ptr; + input.ptr = nullptr; + return *this; + } + + __forceinline Ref& operator =(Type* const input) + { + if (input) + input->refInc(); + if (ptr) + ptr->refDec(); + ptr = input; + return *this; + } 
+ + __forceinline Ref& operator =(NullTy) + { + if (ptr) + ptr->refDec(); + ptr = nullptr; + return *this; + } + + __forceinline operator bool() const { return ptr != nullptr; } + + __forceinline const Type& operator *() const { return *ptr; } + __forceinline Type& operator *() { return *ptr; } + __forceinline const Type* operator ->() const { return ptr; } + __forceinline Type* operator ->() { return ptr; } + + template + __forceinline Ref cast() { return Ref(static_cast(ptr)); } + template + __forceinline const Ref cast() const { return Ref(static_cast(ptr)); } + + template + __forceinline Ref dynamicCast() { return Ref(dynamic_cast(ptr)); } + template + __forceinline const Ref dynamicCast() const { return Ref(dynamic_cast(ptr)); } + }; + + template __forceinline bool operator < (const Ref& a, const Ref& b) { return a.ptr < b.ptr; } + + template __forceinline bool operator ==(const Ref& a, NullTy ) { return a.ptr == nullptr; } + template __forceinline bool operator ==(NullTy , const Ref& b) { return nullptr == b.ptr; } + template __forceinline bool operator ==(const Ref& a, const Ref& b) { return a.ptr == b.ptr; } + + template __forceinline bool operator !=(const Ref& a, NullTy ) { return a.ptr != nullptr; } + template __forceinline bool operator !=(NullTy , const Ref& b) { return nullptr != b.ptr; } + template __forceinline bool operator !=(const Ref& a, const Ref& b) { return a.ptr != b.ptr; } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp new file mode 100644 index 00000000000..d95ff8dfe06 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.cpp @@ -0,0 +1,30 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "regression.h" + +namespace embree +{ + /* registerRegressionTest is invoked from static initializers, thus + * we cannot have the regression_tests variable as global static + * variable due to issues with static variable 
initialization + * order. */ + std::vector& get_regression_tests() + { + static std::vector regression_tests; + return regression_tests; + } + + void registerRegressionTest(RegressionTest* test) + { + get_regression_tests().push_back(test); + } + + RegressionTest* getRegressionTest(size_t index) + { + if (index >= get_regression_tests().size()) + return nullptr; + + return get_regression_tests()[index]; + } +} diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h new file mode 100644 index 00000000000..632f8d92cf5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/regression.h @@ -0,0 +1,25 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" + +#include + +namespace embree +{ + /*! virtual interface for all regression tests */ + struct RegressionTest + { + RegressionTest (std::string name) : name(name) {} + virtual bool run() = 0; + std::string name; + }; + + /*! registers a regression test */ + void registerRegressionTest(RegressionTest* test); + + /*! 
run all regression tests */ + RegressionTest* getRegressionTest(size_t index); +} diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp new file mode 100644 index 00000000000..931244383e0 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.cpp @@ -0,0 +1,42 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "string.h" + +#include +#include + +namespace embree +{ + char to_lower(char c) { return char(tolower(int(c))); } + char to_upper(char c) { return char(toupper(int(c))); } + std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } + std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } + + Vec2f string_to_Vec2f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); + return Vec2f(x,y); + } + + Vec3f string_to_Vec3f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); + return Vec3f(x,y,z); + } + + Vec4f string_to_Vec4f ( std::string str ) + { + size_t next = 0; + const float x = std::stof(str,&next); str = str.substr(next+1); + const float y = std::stof(str,&next); str = str.substr(next+1); + const float z = std::stof(str,&next); str = str.substr(next+1); + const float w = std::stof(str,&next); + return Vec4f(x,y,z,w); + } +} diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h new file mode 100644 index 00000000000..2e9b0f88c32 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/string.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "../math/vec2.h" +#include "../math/vec3.h" +#include "../math/vec4.h" + +namespace embree +{ + class IOStreamStateRestorer + { + public: + IOStreamStateRestorer(std::ostream& iostream) + : iostream(iostream), flags(iostream.flags()), precision(iostream.precision()) { + } + + ~IOStreamStateRestorer() { + iostream.flags(flags); + iostream.precision(precision); + } + + private: + std::ostream& iostream; + std::ios::fmtflags flags; + std::streamsize precision; + }; + + std::string toLowerCase(const std::string& s); + std::string toUpperCase(const std::string& s); + + Vec2f string_to_Vec2f ( std::string str ); + Vec3f string_to_Vec3f ( std::string str ); + Vec4f string_to_Vec4f ( std::string str ); +} diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp new file mode 100644 index 00000000000..1d11436770c --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp @@ -0,0 +1,676 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "sysinfo.h" +#include "intrinsics.h" +#include "string.h" +#include "ref.h" +#if defined(__FREEBSD__) +#include +#include +typedef cpuset_t cpu_set_t; +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// All Platforms +//////////////////////////////////////////////////////////////////////////////// + +namespace embree +{ + NullTy null; + + std::string getPlatformName() + { +#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON) + return "Android Linux (aarch64 / arm64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__) + return "Android Linux (x64)"; +#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86)) + return "Android Linux (x86)"; +#elif defined(__LINUX__) && !defined(__X86_64__) + 
return "Linux (32bit)"; +#elif defined(__LINUX__) && defined(__X86_64__) + return "Linux (64bit)"; +#elif defined(__FREEBSD__) && !defined(__X86_64__) + return "FreeBSD (32bit)"; +#elif defined(__FREEBSD__) && defined(__X86_64__) + return "FreeBSD (64bit)"; +#elif defined(__CYGWIN__) && !defined(__X86_64__) + return "Cygwin (32bit)"; +#elif defined(__CYGWIN__) && defined(__X86_64__) + return "Cygwin (64bit)"; +#elif defined(__WIN32__) && !defined(__X86_64__) + return "Windows (32bit)"; +#elif defined(__WIN32__) && defined(__X86_64__) + return "Windows (64bit)"; +#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__) + return "iOS Simulator (x64)"; +#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON) + return "iOS (aarch64 / arm64)"; +#elif defined(__MACOSX__) && !defined(__X86_64__) + return "Mac OS X (32bit)"; +#elif defined(__MACOSX__) && defined(__X86_64__) + return "Mac OS X (64bit)"; +#elif defined(__UNIX__) && defined(__aarch64__) + return "Unix (aarch64)"; +#elif defined(__UNIX__) && !defined(__X86_64__) + return "Unix (32bit)"; +#elif defined(__UNIX__) && defined(__X86_64__) + return "Unix (64bit)"; +#else + return "Unknown"; +#endif + } + + std::string getCompilerName() + { +#if defined(__INTEL_COMPILER) + int icc_mayor = __INTEL_COMPILER / 100 % 100; + int icc_minor = __INTEL_COMPILER % 100; + std::string version = "Intel Compiler "; + version += toString(icc_mayor); + version += "." + toString(icc_minor); +#if defined(__INTEL_COMPILER_UPDATE) + version += "." 
+ toString(__INTEL_COMPILER_UPDATE); +#endif + return version; +#elif defined(__clang__) + return "CLANG " __clang_version__; +#elif defined (__GNUC__) + return "GCC " __VERSION__; +#elif defined(_MSC_VER) + std::string version = toString(_MSC_FULL_VER); + version.insert(4,"."); + version.insert(9,"."); + version.insert(2,"."); + return "Visual C++ Compiler " + version; +#else + return "Unknown Compiler"; +#endif + } + + std::string getCPUVendor() + { + int cpuinfo[4]; + __cpuid (cpuinfo, 0); + int name[4]; + name[0] = cpuinfo[1]; + name[1] = cpuinfo[3]; + name[2] = cpuinfo[2]; + name[3] = 0; + return (char*)name; + } + + CPU getCPUModel() + { + if (getCPUVendor() != "GenuineIntel") + return CPU::UNKNOWN; + + int out[4]; + __cpuid(out, 0); + if (out[0] < 1) return CPU::UNKNOWN; + __cpuid(out, 1); + + /* please see CPUID documentation for these formulas */ + uint32_t family_ID = (out[0] >> 8) & 0x0F; + uint32_t extended_family_ID = (out[0] >> 20) & 0xFF; + + uint32_t model_ID = (out[0] >> 4) & 0x0F; + uint32_t extended_model_ID = (out[0] >> 16) & 0x0F; + + uint32_t DisplayFamily = family_ID; + if (family_ID == 0x0F) + DisplayFamily += extended_family_ID; + + uint32_t DisplayModel = model_ID; + if (family_ID == 0x06 || family_ID == 0x0F) + DisplayModel += extended_model_ID << 4; + + uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); + + // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel) + if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE; + if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE; + if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE; + if (DisplayFamily_DisplayModel == 
0x068E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE; + if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE; + if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE; + if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL; + if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL; + if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL; + if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL; + if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE; + if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM; + if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2; + if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2; + if 
(DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1; + + if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; + if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING; + + return CPU::UNKNOWN; + } + + std::string stringOfCPUModel(CPU model) + { + switch (model) { + case CPU::XEON_ICE_LAKE : return "Xeon Ice Lake"; + case CPU::CORE_ICE_LAKE : return "Core Ice Lake"; + case CPU::CORE_TIGER_LAKE : return "Core Tiger Lake"; + case CPU::CORE_COMET_LAKE : return "Core Comet Lake"; + case CPU::CORE_CANNON_LAKE : return "Core Cannon Lake"; + case CPU::CORE_KABY_LAKE : return "Core Kaby Lake"; + case CPU::XEON_SKY_LAKE : return "Xeon Sky Lake"; + case CPU::CORE_SKY_LAKE : return "Core Sky Lake"; + case CPU::XEON_PHI_KNIGHTS_MILL : return "Xeon Phi Knights Mill"; + case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing"; + case CPU::XEON_BROADWELL : return "Xeon Broadwell"; + case CPU::CORE_BROADWELL : return "Core Broadwell"; + case CPU::XEON_HASWELL : return "Xeon Haswell"; + case CPU::CORE_HASWELL : return "Core Haswell"; + case CPU::XEON_IVY_BRIDGE : return "Xeon Ivy Bridge"; + case CPU::CORE_IVY_BRIDGE : return "Core Ivy Bridge"; + case CPU::SANDY_BRIDGE : return "Sandy Bridge"; + case CPU::NEHALEM : return "Nehalem"; + case CPU::CORE2 : return "Core2"; + case CPU::CORE1 : return "Core"; + case CPU::ARM : return "Arm"; + case CPU::UNKNOWN : return "Unknown CPU"; + } + return "Unknown CPU (error)"; + } + +#if !defined(__ARM_NEON) + /* constants to access destination registers of CPUID instruction */ + static const int EAX = 0; + static const int EBX = 1; + static const int ECX = 2; + static const int EDX = 3; + + /* cpuid[eax=1].ecx */ + static const int CPU_FEATURE_BIT_SSE3 = 1 << 0; + static const int CPU_FEATURE_BIT_SSSE3 = 1 << 9; + static const int CPU_FEATURE_BIT_FMA3 = 1 << 12; + static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19; + static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20; + //static const 
int CPU_FEATURE_BIT_MOVBE = 1 << 22; + static const int CPU_FEATURE_BIT_POPCNT = 1 << 23; + //static const int CPU_FEATURE_BIT_XSAVE = 1 << 26; + static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27; + static const int CPU_FEATURE_BIT_AVX = 1 << 28; + static const int CPU_FEATURE_BIT_F16C = 1 << 29; + static const int CPU_FEATURE_BIT_RDRAND = 1 << 30; + + /* cpuid[eax=1].edx */ + static const int CPU_FEATURE_BIT_SSE = 1 << 25; + static const int CPU_FEATURE_BIT_SSE2 = 1 << 26; + + /* cpuid[eax=0x80000001].ecx */ + static const int CPU_FEATURE_BIT_LZCNT = 1 << 5; + + /* cpuid[eax=7,ecx=0].ebx */ + static const int CPU_FEATURE_BIT_BMI1 = 1 << 3; + static const int CPU_FEATURE_BIT_AVX2 = 1 << 5; + static const int CPU_FEATURE_BIT_BMI2 = 1 << 8; + static const int CPU_FEATURE_BIT_AVX512F = 1 << 16; // AVX512F (foundation) + static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17; // AVX512DQ (doubleword and quadword instructions) + static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26; // AVX512PF (prefetch gather/scatter instructions) + static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27; // AVX512ER (exponential and reciprocal instructions) + static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28; // AVX512CD (conflict detection instructions) + static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30; // AVX512BW (byte and word instructions) + static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31; // AVX512VL (vector length extensions) + static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21; // AVX512IFMA (integer fused multiple-add instructions) + + /* cpuid[eax=7,ecx=0].ecx */ + static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1; // AVX512VBMI (vector bit manipulation instructions) +#endif + +#if !defined(__ARM_NEON) + __noinline int64_t get_xcr0() + { + // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466 +#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK) + int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 + 
xcr0 = _xgetbv(0); + return xcr0; +#else + int xcr0 = 0; + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); + return xcr0; +#endif + } +#endif + + int getCPUFeatures() + { +#if defined(__ARM_NEON) + int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; +#if defined(NEON_AVX2_EMULATION) + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_YMM_ENABLED; + cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_AVX; + cpu_features |= CPU_FEATURE_AVX2; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + cpu_features |= CPU_FEATURE_BMI1; + cpu_features |= CPU_FEATURE_BMI2; + cpu_features |= CPU_FEATURE_NEON_2X; + + + +#endif + return cpu_features; + +#else + /* cache CPU features access */ + static int cpu_features = 0; + if (cpu_features) + return cpu_features; + + /* get number of CPUID leaves */ + int cpuid_leaf0[4]; + __cpuid(cpuid_leaf0, 0x00000000); + unsigned nIds = cpuid_leaf0[EAX]; + + /* get number of extended CPUID leaves */ + int cpuid_leafe[4]; + __cpuid(cpuid_leafe, 0x80000000); + unsigned nExIds = cpuid_leafe[EAX]; + + /* get CPUID leaves for EAX = 1,7, and 0x80000001 */ + int cpuid_leaf_1[4] = { 0,0,0,0 }; + int cpuid_leaf_7[4] = { 0,0,0,0 }; + int cpuid_leaf_e1[4] = { 0,0,0,0 }; + if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); +#if _WIN32 +#if _MSC_VER && (_MSC_FULL_VER < 160040219) +#else + if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); +#endif +#else + if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0); +#endif + if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001); + + /* detect if OS saves XMM, YMM, and ZMM states */ + bool xmm_enabled = true; + bool ymm_enabled = false; + bool zmm_enabled = false; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) { + int64_t xcr0 = get_xcr0(); + xmm_enabled = ((xcr0 & 0x02) == 
0x02); /* checks if xmm are enabled in XCR0 */ + ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */ + zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */ + } + if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED; + if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED; + if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED; + + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE ) cpu_features |= CPU_FEATURE_SSE; + if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2 ) cpu_features |= CPU_FEATURE_SSE2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3 ) cpu_features |= CPU_FEATURE_SSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX ) cpu_features |= CPU_FEATURE_AVX; + + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C ) cpu_features |= CPU_FEATURE_F16C; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2 ) cpu_features |= CPU_FEATURE_AVX2; + if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3 ) cpu_features |= CPU_FEATURE_FMA3; + if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1; + if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2; + + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F ) cpu_features |= CPU_FEATURE_AVX512F; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ ) cpu_features |= CPU_FEATURE_AVX512DQ; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF ) cpu_features 
|= CPU_FEATURE_AVX512PF; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER ) cpu_features |= CPU_FEATURE_AVX512ER; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD ) cpu_features |= CPU_FEATURE_AVX512CD; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW ) cpu_features |= CPU_FEATURE_AVX512BW; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA; + if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL ) cpu_features |= CPU_FEATURE_AVX512VL; + if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; + + return cpu_features; +#endif + } + + std::string stringOfCPUFeatures(int features) + { + std::string str; + if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM "; + if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM "; + if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM "; + if (features & CPU_FEATURE_SSE ) str += "SSE "; + if (features & CPU_FEATURE_SSE2 ) str += "SSE2 "; + if (features & CPU_FEATURE_SSE3 ) str += "SSE3 "; + if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 "; + if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 "; + if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 "; + if (features & CPU_FEATURE_POPCNT) str += "POPCNT "; + if (features & CPU_FEATURE_AVX ) str += "AVX "; + if (features & CPU_FEATURE_F16C ) str += "F16C "; + if (features & CPU_FEATURE_RDRAND) str += "RDRAND "; + if (features & CPU_FEATURE_AVX2 ) str += "AVX2 "; + if (features & CPU_FEATURE_FMA3 ) str += "FMA3 "; + if (features & CPU_FEATURE_LZCNT ) str += "LZCNT "; + if (features & CPU_FEATURE_BMI1 ) str += "BMI1 "; + if (features & CPU_FEATURE_BMI2 ) str += "BMI2 "; + if (features & CPU_FEATURE_AVX512F) str += "AVX512F "; + if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ "; + if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF "; + if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER "; + if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD "; + if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW 
"; + if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; + if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; + if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + if (features & CPU_FEATURE_NEON) str += "NEON "; + if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; + return str; + } + + std::string stringOfISA (int isa) + { + if (isa == SSE) return "SSE"; + if (isa == SSE2) return "SSE2"; + if (isa == SSE3) return "SSE3"; + if (isa == SSSE3) return "SSSE3"; + if (isa == SSE41) return "SSE4.1"; + if (isa == SSE42) return "SSE4.2"; + if (isa == AVX) return "AVX"; + if (isa == AVX2) return "AVX2"; + if (isa == AVX512KNL) return "AVX512KNL"; + if (isa == AVX512SKX) return "AVX512SKX"; + if (isa == NEON) return "NEON"; + if (isa == NEON_2X) return "2xNEON"; + return "UNKNOWN"; + } + + bool hasISA(int features, int isa) { + return (features & isa) == isa; + } + + std::string supportedTargetList (int features) + { + std::string v; + if (hasISA(features,SSE)) v += "SSE "; + if (hasISA(features,SSE2)) v += "SSE2 "; + if (hasISA(features,SSE3)) v += "SSE3 "; + if (hasISA(features,SSSE3)) v += "SSSE3 "; + if (hasISA(features,SSE41)) v += "SSE4.1 "; + if (hasISA(features,SSE42)) v += "SSE4.2 "; + if (hasISA(features,AVX)) v += "AVX "; + if (hasISA(features,AVXI)) v += "AVXI "; + if (hasISA(features,AVX2)) v += "AVX2 "; + if (hasISA(features,AVX512KNL)) v += "AVX512KNL "; + if (hasISA(features,AVX512SKX)) v += "AVX512SKX "; + if (hasISA(features,NEON)) v += "NEON "; + if (hasISA(features,NEON_2X)) v += "2xNEON "; + return v; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include +#include + +namespace embree +{ + std::string getExecutableFileName() { + char filename[1024]; + if (!GetModuleFileName(nullptr, filename, sizeof(filename))) + return 
std::string(); + return std::string(filename); + } + + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc) GetProcAddress(hlib, "GetActiveProcessorCount"); + + if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0; + for (int i = 0; i < groups; i++) + totalProcessors += pGetActiveProcessorCount(i); + nThreads = totalProcessors; + } + else + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + nThreads = sysinfo.dwNumberOfProcessors; + } + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE); + if (handle == INVALID_HANDLE_VALUE) return 80; + CONSOLE_SCREEN_BUFFER_INFO info; + memset(&info,0,sizeof(info)); + GetConsoleScreenBufferInfo(handle, &info); + return info.dwSize.X; + } + + double getSeconds() + { + LARGE_INTEGER freq, val; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&val); + return (double)val.QuadPart / (double)freq.QuadPart; + } + + void sleepSeconds(double t) { + Sleep(DWORD(1000.0*t)); + } + + size_t getVirtualMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.QuotaPeakPagedPoolUsage; + } + + size_t getResidentMemoryBytes() + { + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ); + return (size_t)info.WorkingSetSize; + } +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include +#include + +namespace embree +{ + std::string getExecutableFileName() + { + std::string pid = "/proc/" + toString(getpid()) + "/exe"; + char buf[4096]; + memset(buf,0,sizeof(buf)); + if (readlink(pid.c_str(), buf, sizeof(buf)-1) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return virt*sysconf(_SC_PAGE_SIZE); + } + + size_t getResidentMemoryBytes() + { + size_t virt, resident, shared; + std::ifstream buffer("/proc/self/statm"); + buffer >> virt >> resident >> shared; + return resident*sysconf(_SC_PAGE_SIZE); + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// FreeBSD Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined (__FreeBSD__) + +#include + +namespace embree +{ + std::string getExecutableFileName() + { + const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + char buf[4096]; + memset(buf,0,sizeof(buf)); + size_t len = sizeof(buf)-1; + if (sysctl(mib, 4, buf, &len, 0x0, 0) == -1) + return std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Mac OS X Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include + +namespace embree +{ + std::string getExecutableFileName() + { + char buf[4096]; + uint32_t size = sizeof(buf); + if (_NSGetExecutablePath(buf, &size) != 0) + return 
std::string(); + return std::string(buf); + } + + size_t getVirtualMemoryBytes() { + return 0; + } + + size_t getResidentMemoryBytes() { + return 0; + } +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) + +#include +#include +#include +#include + +namespace embree +{ + unsigned int getNumberOfLogicalThreads() + { + static int nThreads = -1; + if (nThreads != -1) return nThreads; + +#if defined(__MACOSX__) || defined(__ANDROID__) + nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container + assert(nThreads); +#else + cpu_set_t set; + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) + nThreads = CPU_COUNT(&set); +#endif + + assert(nThreads); + return nThreads; + } + + int getTerminalWidth() + { + struct winsize info; + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &info) < 0) return 80; + return info.ws_col; + } + + double getSeconds() { + struct timeval tp; gettimeofday(&tp,nullptr); + return double(tp.tv_sec) + double(tp.tv_usec)/1E6; + } + + void sleepSeconds(double t) { + usleep(1000000.0*t); + } +} +#endif + diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h new file mode 100644 index 00000000000..8e313a59b32 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/sysinfo.h @@ -0,0 +1,192 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define CACHELINE_SIZE 64 + +#if !defined(PAGE_SIZE) + #define PAGE_SIZE 4096 +#endif + +#define PAGE_SIZE_2M (2*1024*1024) +#define PAGE_SIZE_4K (4*1024) + +#include "platform.h" + +/* define isa namespace and ISA bitvector */ +#if defined (__AVX512VL__) +# define isa avx512skx +# define ISA AVX512SKX +# define ISA_STR "AVX512SKX" +#elif defined (__AVX512F__) +# define isa avx512knl +# define ISA AVX512KNL +# 
define ISA_STR "AVX512KNL" +#elif defined (__AVX2__) +# define isa avx2 +# define ISA AVX2 +# define ISA_STR "AVX2" +#elif defined(__AVXI__) +# define isa avxi +# define ISA AVXI +# define ISA_STR "AVXI" +#elif defined(__AVX__) +# define isa avx +# define ISA AVX +# define ISA_STR "AVX" +#elif defined (__SSE4_2__) +# define isa sse42 +# define ISA SSE42 +# define ISA_STR "SSE4.2" +//#elif defined (__SSE4_1__) // we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11 +//# define isa sse41 +//# define ISA SSE41 +//# define ISA_STR "SSE4.1" +//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC +//# define isa ssse3 +//# define ISA SSSE3 +//# define ISA_STR "SSSE3" +//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang +//# define isa sse3 +//# define ISA SSE3 +//# define ISA_STR "SSE3" +#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) +# define isa sse2 +# define ISA SSE2 +# define ISA_STR "SSE2" +#elif defined(__SSE__) +# define isa sse +# define ISA SSE +# define ISA_STR "SSE" +#elif defined(__ARM_NEON) +// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. +#define isa sse2 +#define ISA NEON +#define ISA_STR "NEON" +#else +#error Unknown ISA +#endif + +namespace embree +{ + enum class CPU + { + XEON_ICE_LAKE, + CORE_ICE_LAKE, + CORE_TIGER_LAKE, + CORE_COMET_LAKE, + CORE_CANNON_LAKE, + CORE_KABY_LAKE, + XEON_SKY_LAKE, + CORE_SKY_LAKE, + XEON_PHI_KNIGHTS_MILL, + XEON_PHI_KNIGHTS_LANDING, + XEON_BROADWELL, + CORE_BROADWELL, + XEON_HASWELL, + CORE_HASWELL, + XEON_IVY_BRIDGE, + CORE_IVY_BRIDGE, + SANDY_BRIDGE, + NEHALEM, + CORE2, + CORE1, + ARM, + UNKNOWN, + }; + + /*! get the full path to the running executable */ + std::string getExecutableFileName(); + + /*! return platform name */ + std::string getPlatformName(); + + /*! get the full name of the compiler */ + std::string getCompilerName(); + + /*! 
return the name of the CPU */ + std::string getCPUVendor(); + + /*! get microprocessor model */ + CPU getCPUModel(); + + /*! converts CPU model into string */ + std::string stringOfCPUModel(CPU model); + + /*! CPU features */ + static const int CPU_FEATURE_SSE = 1 << 0; + static const int CPU_FEATURE_SSE2 = 1 << 1; + static const int CPU_FEATURE_SSE3 = 1 << 2; + static const int CPU_FEATURE_SSSE3 = 1 << 3; + static const int CPU_FEATURE_SSE41 = 1 << 4; + static const int CPU_FEATURE_SSE42 = 1 << 5; + static const int CPU_FEATURE_POPCNT = 1 << 6; + static const int CPU_FEATURE_AVX = 1 << 7; + static const int CPU_FEATURE_F16C = 1 << 8; + static const int CPU_FEATURE_RDRAND = 1 << 9; + static const int CPU_FEATURE_AVX2 = 1 << 10; + static const int CPU_FEATURE_FMA3 = 1 << 11; + static const int CPU_FEATURE_LZCNT = 1 << 12; + static const int CPU_FEATURE_BMI1 = 1 << 13; + static const int CPU_FEATURE_BMI2 = 1 << 14; + static const int CPU_FEATURE_AVX512F = 1 << 16; + static const int CPU_FEATURE_AVX512DQ = 1 << 17; + static const int CPU_FEATURE_AVX512PF = 1 << 18; + static const int CPU_FEATURE_AVX512ER = 1 << 19; + static const int CPU_FEATURE_AVX512CD = 1 << 20; + static const int CPU_FEATURE_AVX512BW = 1 << 21; + static const int CPU_FEATURE_AVX512VL = 1 << 22; + static const int CPU_FEATURE_AVX512IFMA = 1 << 23; + static const int CPU_FEATURE_AVX512VBMI = 1 << 24; + static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; + static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; + static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; + static const int CPU_FEATURE_NEON = 1 << 28; + static const int CPU_FEATURE_NEON_2X = 1 << 29; + + /*! get CPU features */ + int getCPUFeatures(); + + /*! convert CPU features into a string */ + std::string stringOfCPUFeatures(int features); + + /*! creates a string of all supported targets that are supported */ + std::string supportedTargetList (int isa); + + /*! 
ISAs */ + static const int SSE = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED; + static const int SSE2 = SSE | CPU_FEATURE_SSE2; + static const int SSE3 = SSE2 | CPU_FEATURE_SSE3; + static const int SSSE3 = SSE3 | CPU_FEATURE_SSSE3; + static const int SSE41 = SSSE3 | CPU_FEATURE_SSE41; + static const int SSE42 = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT; + static const int AVX = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED; + static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; + static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; + static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED; + static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; + static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; + + /*! converts ISA bitvector into a string */ + std::string stringOfISA(int features); + + /*! return the number of logical threads of the system */ + unsigned int getNumberOfLogicalThreads(); + + /*! returns the size of the terminal window in characters */ + int getTerminalWidth(); + + /*! returns performance counter in seconds */ + double getSeconds(); + + /*! sleeps the specified number of seconds */ + void sleepSeconds(double t); + + /*! returns virtual address space occupied by process */ + size_t getVirtualMemoryBytes(); + + /*! 
returns resident memory required by process */ + size_t getResidentMemoryBytes(); +} diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp new file mode 100644 index 00000000000..f9ea5b7d966 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.cpp @@ -0,0 +1,429 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "thread.h" +#include "sysinfo.h" +#include "string.h" + +#include +#if defined(__ARM_NEON) +#include "../math/SSE2NEON.h" +#else +#include +#endif + +#if defined(PTHREADS_WIN32) +#pragma comment (lib, "pthreadVC.lib") +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Windows Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__WIN32__) + +#define WIN32_LEAN_AND_MEAN +#include + +namespace embree +{ + /*! set the affinity of a given thread */ + void setAffinity(HANDLE thread, ssize_t affinity) + { + typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)(); + typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD); + typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY); + typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER); + HMODULE hlib = LoadLibrary("Kernel32"); + GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount"); + GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount"); + SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity"); + SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx"); + if (pGetActiveProcessorGroupCount 
&& pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx) + { + int groups = pGetActiveProcessorGroupCount(); + int totalProcessors = 0, group = 0, number = 0; + for (int i = 0; i affinity) { + group = i; + number = (int)affinity - totalProcessors; + break; + } + totalProcessors += processors; + } + + GROUP_AFFINITY groupAffinity; + groupAffinity.Group = (WORD)group; + groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number); + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr)) + WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning + + PROCESSOR_NUMBER processorNumber; + processorNumber.Group = group; + processorNumber.Number = number; + processorNumber.Reserved = 0; + if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr)) + WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning + } + else + { + if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity))) + WARNING("SetThreadAffinityMask failed"); // on purpose only a warning + if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1) + WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning + } + } + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) { + setAffinity(GetCurrentThread(), affinity); + } + + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg) + : f(f), arg(arg) {} + public: + thread_func f; + void* arg; + }; + + DWORD WINAPI threadStartup(LPVOID ptr) + { + ThreadStartupData* parg = (ThreadStartupData*) ptr; + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + parg->f(parg->arg); + delete parg; + parg = nullptr; + return 0; + } + +#if !defined(PTHREADS_WIN32) + + /*! 
creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr); + if (thread == nullptr) FATAL("CreateThread failed"); + if (threadID >= 0) setAffinity(thread, threadID); + return thread_t(thread); + } + + /*! the thread calling this function gets yielded */ + void yield() { + SwitchToThread(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + WaitForSingleObject(HANDLE(tid), INFINITE); + CloseHandle(HANDLE(tid)); + } + + /*! creates thread local storage */ + tls_t createTls() { + return tls_t(size_t(TlsAlloc())); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) { + TlsSetValue(DWORD(size_t(tls)), ptr); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) { + return TlsGetValue(DWORD(size_t(tls))); + } + + /*! 
destroys thread local storage identifier */ + void destroyTls(tls_t tls) { + TlsFree(DWORD(size_t(tls))); + } +#endif +} + +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Linux Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__LINUX__) + +#include +#include +#include + +#if defined(__ANDROID__) +#include +#endif + +namespace embree +{ + static MutexSys mutex; + static std::vector threadIDs; + +#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target + /* changes thread ID mapping such that we first fill up all thread on one core */ + size_t mapThreadID(size_t threadID) + { + Lock lock(mutex); + + if (threadIDs.size() == 0) + { + /* parse thread/CPU topology */ + for (size_t cpuID=0;;cpuID++) + { + std::fstream fs; + std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list"); + fs.open (cpu.c_str(), std::fstream::in); + if (fs.fail()) break; + + int i; + while (fs >> i) + { + if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; })) + threadIDs.push_back(i); + if (fs.peek() == ',') + fs.ignore(); + } + fs.close(); + } + +#if 0 + for (size_t i=0;i " << threadIDs[i] << std::endl; +#endif + + /* verify the mapping and do not use it if the mapping has errors */ + for (size_t i=0;i + +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(affinity, &cset); + + pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// MacOSX Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__MACOSX__) + +#include +#include +#include + +namespace embree +{ + /*! 
set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + thread_affinity_policy ap; + ap.affinity_tag = affinity; + if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) + WARNING("setting thread affinity failed"); // on purpose only a warning + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +/// Unix Platform +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__UNIX__) || defined(PTHREADS_WIN32) + +#include +#include + +#if defined(__USE_NUMA__) +#include +#endif + +namespace embree +{ + struct ThreadStartupData + { + public: + ThreadStartupData (thread_func f, void* arg, int affinity) + : f(f), arg(arg), affinity(affinity) {} + public: + thread_func f; + void* arg; + ssize_t affinity; + }; + + static void* threadStartup(ThreadStartupData* parg) + { + _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6)); + + /*! Mac OS X does not support setting affinity at thread creation time */ +#if defined(__MACOSX__) + if (parg->affinity >= 0) + setAffinity(parg->affinity); +#endif + + parg->f(parg->arg); + delete parg; + parg = nullptr; + return nullptr; + } + + /*! 
creates a hardware thread running on specific core */ + thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) + { + /* set stack size */ + pthread_attr_t attr; + pthread_attr_init(&attr); + if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size); + + /* create thread */ + pthread_t* tid = new pthread_t; + if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) { + pthread_attr_destroy(&attr); + delete tid; + FATAL("pthread_create failed"); + } + pthread_attr_destroy(&attr); + + /* set affinity */ +#if defined(__LINUX__) && !defined(__ANDROID__) + if (threadID >= 0) { + cpu_set_t cset; + CPU_ZERO(&cset); + threadID = mapThreadID(threadID); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#elif defined(__FreeBSD__) + if (threadID >= 0) { + cpuset_t cset; + CPU_ZERO(&cset); + CPU_SET(threadID, &cset); + pthread_setaffinity_np(*tid, sizeof(cset), &cset); + } +#endif + + return thread_t(tid); + } + + /*! the thread calling this function gets yielded */ + void yield() { + sched_yield(); + } + + /*! waits until the given thread has terminated */ + void join(thread_t tid) { + if (pthread_join(*(pthread_t*)tid, nullptr) != 0) + FATAL("pthread_join failed"); + delete (pthread_t*)tid; + } + + /*! creates thread local storage */ + tls_t createTls() + { + pthread_key_t* key = new pthread_key_t; + if (pthread_key_create(key,nullptr) != 0) { + delete key; + FATAL("pthread_key_create failed"); + } + + return tls_t(key); + } + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls) + { + assert(tls); + return pthread_getspecific(*(pthread_key_t*)tls); + } + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr) + { + assert(tls); + if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0) + FATAL("pthread_setspecific failed"); + } + + /*! 
destroys thread local storage identifier */ + void destroyTls(tls_t tls) + { + assert(tls); + if (pthread_key_delete(*(pthread_key_t*)tls) != 0) + FATAL("pthread_key_delete failed"); + delete (pthread_key_t*)tls; + } +} + +#endif diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h new file mode 100644 index 00000000000..45da6e6a704 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/thread.h @@ -0,0 +1,46 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "platform.h" +#include "mutex.h" +#include "alloc.h" +#include "vector.h" +#include + +namespace embree +{ + /*! type for thread */ + typedef struct opaque_thread_t* thread_t; + + /*! signature of thread start function */ + typedef void (*thread_func)(void*); + + /*! creates a hardware thread running on specific logical thread */ + thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1); + + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity); + + /*! the thread calling this function gets yielded */ + void yield(); + + /*! waits until the given thread has terminated */ + void join(thread_t tid); + + /*! type for handle to thread local storage */ + typedef struct opaque_tls_t* tls_t; + + /*! creates thread local storage */ + tls_t createTls(); + + /*! set the thread local storage pointer */ + void setTls(tls_t tls, void* const ptr); + + /*! return the thread local storage pointer */ + void* getTls(tls_t tls); + + /*! 
destroys thread local storage identifier */ + void destroyTls(tls_t tls); +} diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h new file mode 100644 index 00000000000..e41794de7c5 --- /dev/null +++ b/thirdparty/embree-aarch64/common/sys/vector.h @@ -0,0 +1,242 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "alloc.h" +#include + +namespace embree +{ + template + class vector_t + { + public: + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + + __forceinline vector_t () + : size_active(0), size_alloced(0), items(nullptr) {} + + __forceinline explicit vector_t (size_t sz) + : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + template + __forceinline explicit vector_t (M alloc, size_t sz) + : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); } + + __forceinline ~vector_t() { + clear(); + } + + __forceinline vector_t (const vector_t& other) + { + size_active = other.size_active; + size_alloced = other.size_alloced; + items = alloc.allocate(size_alloced); + for (size_t i=0; i 0); return items[0]; }; + __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }; + + __forceinline T* data() { return items; }; + __forceinline const T* data() const { return items; }; + + + /******************** Modifiers **************************/ + + __forceinline void push_back(const T& nt) + { + const T v = nt; // need local copy as input reference could point to this vector + internal_resize(size_active,internal_grow_size(size_active+1)); + ::new (&items[size_active++]) T(v); + } + + __forceinline void pop_back() + { + assert(!empty()); + size_active--; + alloc.destroy(&items[size_active]); + } + + __forceinline void clear() + { + /* destroy elements */ + for (size_t i=0; i + using vector = vector_t>; + + /*! 
vector class that performs aligned allocations */ + template + using avector = vector_t::value> >; + + /*! vector class that performs OS allocations */ + template + using ovector = vector_t >; +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h new file mode 100644 index 00000000000..9940e068d06 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h @@ -0,0 +1,17 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#if defined(TASKING_INTERNAL) +# include "taskschedulerinternal.h" +#elif defined(TASKING_GCD) && defined(BUILD_IOS) +# include "taskschedulergcd.h" +#elif defined(TASKING_TBB) +# include "taskschedulertbb.h" +#elif defined(TASKING_PPL) +# include "taskschedulerppl.h" +#else +# error "no tasking system enabled" +#endif + diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h new file mode 100644 index 00000000000..d31f8bb478f --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h @@ -0,0 +1,49 @@ +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#include + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! 
destroys the task scheduler again */ + static void destroy() {} + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { + currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads; + return currentThreadIndex; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() + { + return GCDNumThreads; + } + + private: + static size_t GCDNumThreads; + static size_t currentThreadIndex; + + }; + +}; + diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp new file mode 100644 index 00000000000..ebf656d1a0c --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp @@ -0,0 +1,426 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "taskschedulerinternal.h" +#include "../math/math.h" +#include "../sys/sysinfo.h" +#include + +namespace embree +{ + RTC_NAMESPACE_BEGIN + + static MutexSys g_mutex; + size_t TaskScheduler::g_numThreads = 0; + __thread TaskScheduler* TaskScheduler::g_instance = nullptr; + std::vector> g_instance_vector; + __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr; + TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr; + + template + __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body) + { + while (true) + { + /*! some rounds that yield */ + for (size_t i=0; i<32; i++) + { + /*! some spinning rounds */ + const size_t threadCount = thread.threadCount(); + for (size_t j=0; j<1024; j+=threadCount) + { + if (!pred()) return; + if (thread.scheduler->steal_from_other_threads(thread)) { + i=j=0; + body(); + } + } + yield(); + } + } + } + + /*! 
run this task */ + void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible + { + /* try to run if not already stolen */ + if (try_switch_state(INITIALIZED,DONE)) + { + Task* prevTask = thread.task; + thread.task = this; + // -- GODOT start -- + // try { + // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); + // } catch (...) { + // if (thread.scheduler->cancellingException == nullptr) + // thread.scheduler->cancellingException = std::current_exception(); + // } + // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } + + /* steal until all dependencies have completed */ + steal_loop(thread, + [&] () { return dependencies>0; }, + [&] () { while (thread.tasks.execute_local_internal(thread,this)); }); + + /* now signal our parent task that we are finished */ + if (parent) + parent->add_dependencies(-1); + } + + /*! run this task */ + dll_export void TaskScheduler::Task::run (Thread& thread) { + run_internal(thread); + } + + bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent) + { + /* stop if we run out of local tasks or reach the waiting task */ + if (right == 0 || &tasks[right-1] == parent) + return false; + + /* execute task */ + size_t oldRight = right; + tasks[right-1].run_internal(thread); + if (right != oldRight) { + THROW_RUNTIME_ERROR("you have to wait for spawned subtasks"); + } + + /* pop task and closure from stack */ + right--; + if (tasks[right].stackPtr != size_t(-1)) + stackPtr = tasks[right].stackPtr; + + /* also move left pointer */ + if (left >= right) left.store(right.load()); + + return right != 0; + } + + dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) { + return execute_local_internal(thread,parent); + } + + bool TaskScheduler::TaskQueue::steal(Thread& thread) + { + size_t l = left; + size_t r = right; + if (l < r) + { + l = left++; + if (l >= r) + return false; + } + else + return 
false; + + if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right])) + return false; + + thread.tasks.right++; + return true; + } + + /* we steal from the left */ + size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft() + { + if (left >= right) return 0; + return tasks[left].N; + } + + void threadPoolFunction(std::pair* pair) + { + TaskScheduler::ThreadPool* pool = pair->first; + size_t threadIndex = pair->second; + delete pair; + pool->thread_loop(threadIndex); + } + + TaskScheduler::ThreadPool::ThreadPool(bool set_affinity) + : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {} + + dll_export void TaskScheduler::ThreadPool::startThreads() + { + if (running) return; + setNumThreads(numThreads,true); + } + + void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads) + { + Lock lock(g_mutex); + assert(newNumThreads); + newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads()); + + // We are observing a few % gain by increasing number threads by 2 on aarch64. +#if defined(__aarch64__) && defined(BUILD_IOS) + numThreads = newNumThreads*2; +#else + numThreads = newNumThreads; +#endif + numThreads = newNumThreads; + if (!startThreads && !running) return; + running = true; + size_t numThreadsActive = numThreadsRunning; + + mutex.lock(); + numThreadsRunning = newNumThreads; + mutex.unlock(); + condition.notify_all(); + + /* start new threads */ + for (size_t t=numThreadsActive; t(this,t); + threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? 
t : -1)); + } + + /* stop some threads if we reduce the number of threads */ + for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) { + if (t == 0) continue; + embree::join(threads.back()); + threads.pop_back(); + } + } + + TaskScheduler::ThreadPool::~ThreadPool() + { + /* leave all taskschedulers */ + mutex.lock(); + numThreadsRunning = 0; + mutex.unlock(); + condition.notify_all(); + + /* wait for threads to terminate */ + for (size_t i=0; i& scheduler) + { + mutex.lock(); + schedulers.push_back(scheduler); + mutex.unlock(); + condition.notify_all(); + } + + dll_export void TaskScheduler::ThreadPool::remove(const Ref& scheduler) + { + Lock lock(mutex); + for (std::list >::iterator it = schedulers.begin(); it != schedulers.end(); it++) { + if (scheduler == *it) { + schedulers.erase(it); + return; + } + } + } + + void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex) + { + while (globalThreadIndex < numThreadsRunning) + { + Ref scheduler = NULL; + ssize_t threadIndex = -1; + { + Lock lock(mutex); + condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); }); + if (globalThreadIndex >= numThreadsRunning) break; + scheduler = schedulers.front(); + threadIndex = scheduler->allocThreadIndex(); + } + scheduler->thread_loop(threadIndex); + } + } + + TaskScheduler::TaskScheduler() + : threadCounter(0), anyTasksRunning(0), hasRootTask(false) + { + threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x. 
+ for (size_t i=0; ithreadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadIndex() + { + Thread* thread = TaskScheduler::thread(); + if (thread) return thread->threadIndex; + else return 0; + } + + dll_export size_t TaskScheduler::threadCount() { + return threadPool->size(); + } + + dll_export TaskScheduler* TaskScheduler::instance() + { + if (g_instance == NULL) { + Lock lock(g_mutex); + g_instance = new TaskScheduler; + g_instance_vector.push_back(g_instance); + } + return g_instance; + } + + void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads) + { + if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity); + threadPool->setNumThreads(numThreads,start_threads); + } + + void TaskScheduler::destroy() { + delete threadPool; threadPool = nullptr; + } + + dll_export ssize_t TaskScheduler::allocThreadIndex() + { + size_t threadIndex = threadCounter++; + assert(threadIndex < threadLocal.size()); + return threadIndex; + } + + void TaskScheduler::join() + { + mutex.lock(); + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); + // -- GODOT start -- + // std::exception_ptr except = thread_loop(threadIndex); + // if (except != nullptr) std::rethrow_exception(except); + thread_loop(threadIndex); + // -- GODOT end -- + } + + void TaskScheduler::reset() { + hasRootTask = false; + } + + void TaskScheduler::wait_for_threads(size_t threadCount) + { + while (threadCounter < threadCount-1) + pause_cpu(); + } + + dll_export TaskScheduler::Thread* TaskScheduler::thread() { + return thread_local_thread; + } + + dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread) + { + Thread* old = thread_local_thread; + thread_local_thread = thread; + return old; + } + + dll_export bool TaskScheduler::wait() + { + Thread* thread = TaskScheduler::thread(); + if (thread == nullptr) return true; + while 
(thread->tasks.execute_local_internal(*thread,thread->task)) {}; + return thread->scheduler->cancellingException == nullptr; + } + +// -- GODOT start -- +// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) + void TaskScheduler::thread_loop(size_t threadIndex) +// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + threadLocal[threadIndex].store(&thread); + Thread* oldThread = swapThread(&thread); + + /* main thread loop */ + while (anyTasksRunning) + { + steal_loop(thread, + [&] () { return anyTasksRunning > 0; }, + [&] () { + anyTasksRunning++; + while (thread.tasks.execute_local_internal(thread,nullptr)); + anyTasksRunning--; + }); + } + threadLocal[threadIndex].store(nullptr); + swapThread(oldThread); + + /* remember exception to throw */ + // -- GODOT start -- + // std::exception_ptr except = nullptr; + // if (cancellingException != nullptr) except = cancellingException; + // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; +#if defined(__WIN32__) + size_t loopIndex = 1; +#endif +#define LOOP_YIELD_THRESHOLD (4096) + while (threadCounter > 0) { +#if defined(__WIN32__) + if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0) + yield(); + else + _mm_pause(); + loopIndex++; +#else + yield(); +#endif + } + // -- GODOT start -- + // return except; + return; + // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) + { + const size_t threadIndex = thread.threadIndex; + const size_t threadCount = this->threadCounter; + + for (size_t i=1; i= threadCount) otherThreadIndex -= threadCount; + + Thread* othread = threadLocal[otherThreadIndex].load(); + if (!othread) + continue; + + if (othread->tasks.steal(thread)) + return true; + } + + return false; + } + + dll_export void TaskScheduler::startThreads() { + threadPool->startThreads(); + } + + dll_export void 
TaskScheduler::addScheduler(const Ref& scheduler) { + threadPool->add(scheduler); + } + + dll_export void TaskScheduler::removeScheduler(const Ref& scheduler) { + threadPool->remove(scheduler); + } + + RTC_NAMESPACE_END +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h new file mode 100644 index 00000000000..8bd70b2b8cf --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h @@ -0,0 +1,386 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" +#include "../sys/atomic.h" +#include "../math/range.h" +#include "../../include/embree3/rtcore.h" + +#include + +namespace embree +{ + + /* The tasking system exports some symbols to be used by the tutorials. Thus we + hide is also in the API namespace when requested. */ + RTC_NAMESPACE_BEGIN + + struct TaskScheduler : public RefCount + { + ALIGNED_STRUCT_(64); + friend class Device; + + static const size_t TASK_STACK_SIZE = 4*1024; //!< task structure stack + static const size_t CLOSURE_STACK_SIZE = 512*1024; //!< stack for task closures + + struct Thread; + + /*! virtual interface for all tasks */ + struct TaskFunction { + virtual void execute() = 0; + }; + + /*! builds a task interface from a closure */ + template + struct ClosureTaskFunction : public TaskFunction + { + Closure closure; + __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {} + void execute() { closure(); }; + }; + + struct __aligned(64) Task + { + /*! states a task can be in */ + enum { DONE, INITIALIZED }; + + /*! 
switch from one state to another */ + __forceinline void switch_state(int from, int to) + { + __memory_barrier(); + MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to); + assert(success); + } + + /*! try to switch from one state to another */ + __forceinline bool try_switch_state(int from, int to) { + __memory_barrier(); + return state.compare_exchange_strong(from,to); + } + + /*! increment/decrement dependency counter */ + void add_dependencies(int n) { + dependencies+=n; + } + + /*! initialize all tasks to DONE state by default */ + __forceinline Task() + : state(DONE) {} + + /*! construction of new task */ + __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N) + : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N) + { + if (parent) parent->add_dependencies(+1); + switch_state(DONE,INITIALIZED); + } + + /*! construction of stolen task, stealing thread will decrement initial dependency */ + __forceinline Task (TaskFunction* closure, Task* parent) + : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1) + { + switch_state(DONE,INITIALIZED); + } + + /*! try to steal this task */ + bool try_steal(Task& child) + { + if (!stealable) return false; + if (!try_switch_state(INITIALIZED,DONE)) return false; + new (&child) Task(closure, this); + return true; + } + + /*! 
run this task */ + dll_export void run(Thread& thread); + + void run_internal(Thread& thread); + + public: + std::atomic state; //!< state this task is in + std::atomic dependencies; //!< dependencies to wait for + std::atomic stealable; //!< true if task can be stolen + TaskFunction* closure; //!< the closure to execute + Task* parent; //!< parent task to signal when we are finished + size_t stackPtr; //!< stack location where closure is stored + size_t N; //!< approximative size of task + }; + + struct TaskQueue + { + TaskQueue () + : left(0), right(0), stackPtr(0) {} + + __forceinline void* alloc(size_t bytes, size_t align = 64) + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("closure stack overflow"); + abort(); + // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } + + template + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) + // -- GODOT start -- + // throw std::runtime_error("task stack overflow"); + abort(); + // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; + TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction))) ClosureTaskFunction(closure); + /* gcc 8 or later fails to compile without explicit .load() */ + new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size); + right++; + + /* also move left pointer */ + if (left >= right-1) left = right-1; + } + + dll_export bool execute_local(Thread& thread, Task* parent); + bool execute_local_internal(Thread& thread, Task* parent); + bool steal(Thread& thread); + size_t getTaskSizeAtLeft(); + + bool empty() { return right == 0; } + + public: + + /* task stack */ + Task tasks[TASK_STACK_SIZE]; + __aligned(64) std::atomic left; //!< threads steal from left + __aligned(64) std::atomic right; //!< new tasks are added to the right + + /* 
closure stack */ + __aligned(64) char stack[CLOSURE_STACK_SIZE]; + size_t stackPtr; + }; + + /*! thread local structure for each thread */ + struct Thread + { + ALIGNED_STRUCT_(64); + + Thread (size_t threadIndex, const Ref& scheduler) + : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {} + + __forceinline size_t threadCount() { + return scheduler->threadCounter; + } + + size_t threadIndex; //!< ID of this thread + TaskQueue tasks; //!< local task queue + Task* task; //!< current active task + Ref scheduler; //!< pointer to task scheduler + }; + + /*! pool of worker threads */ + struct ThreadPool + { + ThreadPool (bool set_affinity); + ~ThreadPool (); + + /*! starts the threads */ + dll_export void startThreads(); + + /*! sets number of threads to use */ + void setNumThreads(size_t numThreads, bool startThreads = false); + + /*! adds a task scheduler object for scheduling */ + dll_export void add(const Ref& scheduler); + + /*! remove the task scheduler object again */ + dll_export void remove(const Ref& scheduler); + + /*! returns number of threads of the thread pool */ + size_t size() const { return numThreads; } + + /*! main loop for all threads */ + void thread_loop(size_t threadIndex); + + private: + std::atomic numThreads; + std::atomic numThreadsRunning; + bool set_affinity; + std::atomic running; + std::vector threads; + + private: + MutexSys mutex; + ConditionSys condition; + std::list > schedulers; + }; + + TaskScheduler (); + ~TaskScheduler (); + + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /*! lets new worker threads join the tasking system */ + void join(); + void reset(); + + /*! let a worker thread allocate a thread index */ + dll_export ssize_t allocThreadIndex(); + + /*! 
wait for some number of threads available (threadCount includes main thread) */ + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ + // -- GODOT start -- + // std::exception_ptr thread_loop(size_t threadIndex); + void thread_loop(size_t threadIndex); + // -- GODOT end -- + + /*! steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); + + template + static void steal_loop(Thread& thread, const Predicate& pred, const Body& body); + + /* spawn a new task at the top of the threads task stack */ + template + void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true) + { + if (useThreadPool) startThreads(); + + size_t threadIndex = allocThreadIndex(); + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation + Thread& thread = *mthread; + assert(threadLocal[threadIndex].load() == nullptr); + threadLocal[threadIndex] = &thread; + Thread* oldThread = swapThread(&thread); + thread.tasks.push_right(thread,size,closure); + { + Lock lock(mutex); + anyTasksRunning++; + hasRootTask = true; + condition.notify_all(); + } + + if (useThreadPool) addScheduler(this); + + while (thread.tasks.execute_local(thread,nullptr)); + anyTasksRunning--; + if (useThreadPool) removeScheduler(this); + + threadLocal[threadIndex] = nullptr; + swapThread(oldThread); + + /* remember exception to throw */ + std::exception_ptr except = nullptr; + if (cancellingException != nullptr) except = cancellingException; + + /* wait for all threads to terminate */ + threadCounter--; + while (threadCounter > 0) yield(); + cancellingException = nullptr; + + /* re-throw proper exception */ + if (except != nullptr) + std::rethrow_exception(except); + } + + /* spawn a new task at the top of the threads task stack */ + template + static __forceinline void spawn(size_t size, const Closure& closure) + { + Thread* thread = TaskScheduler::thread(); + if (likely(thread != nullptr)) 
thread->tasks.push_right(*thread,size,closure); + else instance()->spawn_root(closure,size); + } + + /* spawn a new task at the top of the threads task stack */ + template + static __forceinline void spawn(const Closure& closure) { + spawn(1,closure); + } + + /* spawn a new task set */ + template + static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure) + { + spawn(end-begin, [=]() + { + if (end-begin <= blockSize) { + return closure(range(begin,end)); + } + const Index center = (begin+end)/2; + spawn(begin,center,blockSize,closure); + spawn(center,end ,blockSize,closure); + wait(); + }); + } + + /* work on spawned subtasks and wait until all have finished */ + dll_export static bool wait(); + + /* returns the ID of the current thread */ + dll_export static size_t threadID(); + + /* returns the index (0..threadCount-1) of the current thread */ + dll_export static size_t threadIndex(); + + /* returns the total number of threads */ + dll_export static size_t threadCount(); + + private: + + /* returns the thread local task list of this worker thread */ + dll_export static Thread* thread(); + + /* sets the thread local task list of this worker thread */ + dll_export static Thread* swapThread(Thread* thread); + + /*! returns the taskscheduler object to be used by the master thread */ + dll_export static TaskScheduler* instance(); + + /*! starts the threads */ + dll_export static void startThreads(); + + /*! adds a task scheduler object for scheduling */ + dll_export static void addScheduler(const Ref& scheduler); + + /*! 
remove the task scheduler object again */ + dll_export static void removeScheduler(const Ref& scheduler); + + private: + std::vector> threadLocal; + std::atomic threadCounter; + std::atomic anyTasksRunning; + std::atomic hasRootTask; + std::exception_ptr cancellingException; + MutexSys mutex; + ConditionSys condition; + + private: + static size_t g_numThreads; + static __thread TaskScheduler* g_instance; + static __thread Thread* thread_local_thread; + static ThreadPool* threadPool; + }; + + RTC_NAMESPACE_END + +#if defined(RTC_NAMESPACE) + using RTC_NAMESPACE::TaskScheduler; +#endif +} diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h new file mode 100644 index 00000000000..776f98cdacd --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h @@ -0,0 +1,46 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if !defined(__WIN32__) +#error PPL tasking system only available under windows +#endif + +#include + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() { + return GetCurrentThreadId(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + /* FIXME: threadIndex is NOT supported by PPL! 
*/ + static __forceinline size_t threadIndex() { + return 0; + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { + return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1; + } + }; +}; diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h new file mode 100644 index 00000000000..98dba268719 --- /dev/null +++ b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h @@ -0,0 +1,67 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../sys/platform.h" +#include "../sys/alloc.h" +#include "../sys/barrier.h" +#include "../sys/thread.h" +#include "../sys/mutex.h" +#include "../sys/condition.h" +#include "../sys/ref.h" + +#if defined(__WIN32__) +# define NOMINMAX +#endif + +// We need to define these to avoid implicit linkage against +// tbb_debug.lib under Windows. When removing these lines debug build +// under Windows fails. +#define __TBB_NO_IMPLICIT_LINKAGE 1 +#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 +#include "tbb/tbb.h" +#include "tbb/parallel_sort.h" + +namespace embree +{ + struct TaskScheduler + { + /*! initializes the task scheduler */ + static void create(size_t numThreads, bool set_affinity, bool start_threads); + + /*! 
destroys the task scheduler again */ + static void destroy(); + + /* returns the ID of the current thread */ + static __forceinline size_t threadID() + { + return threadIndex(); + } + + /* returns the index (0..threadCount-1) of the current thread */ + static __forceinline size_t threadIndex() + { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::current_thread_index(); +#elif TBB_INTERFACE_VERSION >= 9000 + return tbb::task_arena::current_thread_index(); +#else + return 0; +#endif + } + + /* returns the total number of threads */ + static __forceinline size_t threadCount() { +#if TBB_INTERFACE_VERSION >= 9100 + return tbb::this_task_arena::max_concurrency(); +#else + return tbb::task_scheduler_init::default_num_threads(); +#endif + } + + }; + +}; diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore.h b/thirdparty/embree-aarch64/include/embree3/rtcore.h new file mode 100644 index 00000000000..5830bb58801 --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore.h @@ -0,0 +1,14 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_config.h" +#include "rtcore_common.h" +#include "rtcore_device.h" +#include "rtcore_buffer.h" +#include "rtcore_ray.h" +#include "rtcore_geometry.h" +#include "rtcore_scene.h" +#include "rtcore_builder.h" +#include "rtcore_quaternion.h" diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h new file mode 100644 index 00000000000..400b604aa53 --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h @@ -0,0 +1,51 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Types of buffers */ +enum RTCBufferType +{ + RTC_BUFFER_TYPE_INDEX = 0, + RTC_BUFFER_TYPE_VERTEX = 1, + RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2, + RTC_BUFFER_TYPE_NORMAL = 3, + 
RTC_BUFFER_TYPE_TANGENT = 4, + RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5, + + RTC_BUFFER_TYPE_GRID = 8, + + RTC_BUFFER_TYPE_FACE = 16, + RTC_BUFFER_TYPE_LEVEL = 17, + RTC_BUFFER_TYPE_EDGE_CREASE_INDEX = 18, + RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT = 19, + RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX = 20, + RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21, + RTC_BUFFER_TYPE_HOLE = 22, + + RTC_BUFFER_TYPE_FLAGS = 32 +}; + +/* Opaque buffer type */ +typedef struct RTCBufferTy* RTCBuffer; + +/* Creates a new buffer. */ +RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize); + +/* Creates a new shared buffer. */ +RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize); + +/* Returns a pointer to the buffer data. */ +RTC_API void* rtcGetBufferData(RTCBuffer buffer); + +/* Retains the buffer (increments the reference count). */ +RTC_API void rtcRetainBuffer(RTCBuffer buffer); + +/* Releases the buffer (decrements the reference count). */ +RTC_API void rtcReleaseBuffer(RTCBuffer buffer); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h new file mode 100644 index 00000000000..d62a7f72cc2 --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h @@ -0,0 +1,125 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_scene.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque BVH type */ +typedef struct RTCBVHTy* RTCBVH; + +/* Input build primitives for the builder */ +struct RTC_ALIGN(32) RTCBuildPrimitive +{ + float lower_x, lower_y, lower_z; + unsigned int geomID; + float upper_x, upper_y, upper_z; + unsigned int primID; +}; + +/* Opaque thread local allocator type */ +typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator; + +/* Callback to create a node */ +typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr); + 
+/* Callback to set the pointer to all children */ +typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr); + +/* Callback to set the bounds of all children */ +typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr); + +/* Callback to create a leaf node */ +typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr); + +/* Callback to split a build primitive */ +typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr); + +/* Build flags */ +enum RTCBuildFlags +{ + RTC_BUILD_FLAG_NONE = 0, + RTC_BUILD_FLAG_DYNAMIC = (1 << 0), +}; + +enum RTCBuildConstants +{ + RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32 +}; + +/* Input for builders */ +struct RTCBuildArguments +{ + size_t byteSize; + + enum RTCBuildQuality buildQuality; + enum RTCBuildFlags buildFlags; + unsigned int maxBranchingFactor; + unsigned int maxDepth; + unsigned int sahBlockSize; + unsigned int minLeafSize; + unsigned int maxLeafSize; + float traversalCost; + float intersectionCost; + + RTCBVH bvh; + struct RTCBuildPrimitive* primitives; + size_t primitiveCount; + size_t primitiveArrayCapacity; + + RTCCreateNodeFunction createNode; + RTCSetNodeChildrenFunction setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds; + RTCCreateLeafFunction createLeaf; + RTCSplitPrimitiveFunction splitPrimitive; + RTCProgressMonitorFunction buildProgress; + void* userPtr; +}; + +/* Returns the default build settings. 
*/ +RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments() +{ + struct RTCBuildArguments args; + args.byteSize = sizeof(args); + args.buildQuality = RTC_BUILD_QUALITY_MEDIUM; + args.buildFlags = RTC_BUILD_FLAG_NONE; + args.maxBranchingFactor = 2; + args.maxDepth = 32; + args.sahBlockSize = 1; + args.minLeafSize = 1; + args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF; + args.traversalCost = 1.0f; + args.intersectionCost = 1.0f; + args.bvh = NULL; + args.primitives = NULL; + args.primitiveCount = 0; + args.primitiveArrayCapacity = 0; + args.createNode = NULL; + args.setNodeChildren = NULL; + args.setNodeBounds = NULL; + args.createLeaf = NULL; + args.splitPrimitive = NULL; + args.buildProgress = NULL; + args.userPtr = NULL; + return args; +} + +/* Creates a new BVH. */ +RTC_API RTCBVH rtcNewBVH(RTCDevice device); + +/* Builds a BVH. */ +RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args); + +/* Allocates memory using the thread local allocator. */ +RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align); + +/* Retains the BVH (increments reference count). */ +RTC_API void rtcRetainBVH(RTCBVH bvh); + +/* Releases the BVH (decrements reference count). */ +RTC_API void rtcReleaseBVH(RTCBVH bvh); + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_common.h b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h new file mode 100644 index 00000000000..890e06faa3c --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h @@ -0,0 +1,326 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "rtcore_config.h" + +RTC_NAMESPACE_BEGIN + +#if defined(_WIN32) +#if defined(_M_X64) +typedef long long ssize_t; +#else +typedef int ssize_t; +#endif +#endif + +#if defined(_WIN32) && !defined(__MINGW32__) +# define RTC_ALIGN(...) 
__declspec(align(__VA_ARGS__)) +#else +# define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) +#endif + +#if !defined (RTC_DEPRECATED) +#ifdef __GNUC__ + #define RTC_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) + #define RTC_DEPRECATED __declspec(deprecated) +#else + #define RTC_DEPRECATED +#endif +#endif + +#if defined(_WIN32) +# define RTC_FORCEINLINE __forceinline +#else +# define RTC_FORCEINLINE inline __attribute__((always_inline)) +#endif + +/* Invalid geometry ID */ +#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1) + +/* Maximum number of time steps */ +#define RTC_MAX_TIME_STEP_COUNT 129 + +/* Formats of buffers and other data structures */ +enum RTCFormat +{ + RTC_FORMAT_UNDEFINED = 0, + + /* 8-bit unsigned integer */ + RTC_FORMAT_UCHAR = 0x1001, + RTC_FORMAT_UCHAR2, + RTC_FORMAT_UCHAR3, + RTC_FORMAT_UCHAR4, + + /* 8-bit signed integer */ + RTC_FORMAT_CHAR = 0x2001, + RTC_FORMAT_CHAR2, + RTC_FORMAT_CHAR3, + RTC_FORMAT_CHAR4, + + /* 16-bit unsigned integer */ + RTC_FORMAT_USHORT = 0x3001, + RTC_FORMAT_USHORT2, + RTC_FORMAT_USHORT3, + RTC_FORMAT_USHORT4, + + /* 16-bit signed integer */ + RTC_FORMAT_SHORT = 0x4001, + RTC_FORMAT_SHORT2, + RTC_FORMAT_SHORT3, + RTC_FORMAT_SHORT4, + + /* 32-bit unsigned integer */ + RTC_FORMAT_UINT = 0x5001, + RTC_FORMAT_UINT2, + RTC_FORMAT_UINT3, + RTC_FORMAT_UINT4, + + /* 32-bit signed integer */ + RTC_FORMAT_INT = 0x6001, + RTC_FORMAT_INT2, + RTC_FORMAT_INT3, + RTC_FORMAT_INT4, + + /* 64-bit unsigned integer */ + RTC_FORMAT_ULLONG = 0x7001, + RTC_FORMAT_ULLONG2, + RTC_FORMAT_ULLONG3, + RTC_FORMAT_ULLONG4, + + /* 64-bit signed integer */ + RTC_FORMAT_LLONG = 0x8001, + RTC_FORMAT_LLONG2, + RTC_FORMAT_LLONG3, + RTC_FORMAT_LLONG4, + + /* 32-bit float */ + RTC_FORMAT_FLOAT = 0x9001, + RTC_FORMAT_FLOAT2, + RTC_FORMAT_FLOAT3, + RTC_FORMAT_FLOAT4, + RTC_FORMAT_FLOAT5, + RTC_FORMAT_FLOAT6, + RTC_FORMAT_FLOAT7, + RTC_FORMAT_FLOAT8, + RTC_FORMAT_FLOAT9, + RTC_FORMAT_FLOAT10, + RTC_FORMAT_FLOAT11, + 
RTC_FORMAT_FLOAT12, + RTC_FORMAT_FLOAT13, + RTC_FORMAT_FLOAT14, + RTC_FORMAT_FLOAT15, + RTC_FORMAT_FLOAT16, + + /* 32-bit float matrix (row-major order) */ + RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122, + RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123, + RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124, + RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132, + RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133, + RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134, + RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142, + RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143, + RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144, + + /* 32-bit float matrix (column-major order) */ + RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222, + RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223, + RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224, + RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232, + RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233, + RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234, + RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242, + RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243, + RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244, + + /* special 12-byte format for grids */ + RTC_FORMAT_GRID = 0xA001 +}; + +/* Build quality levels */ +enum RTCBuildQuality +{ + RTC_BUILD_QUALITY_LOW = 0, + RTC_BUILD_QUALITY_MEDIUM = 1, + RTC_BUILD_QUALITY_HIGH = 2, + RTC_BUILD_QUALITY_REFIT = 3, +}; + +/* Axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCBounds +{ + float lower_x, lower_y, lower_z, align0; + float upper_x, upper_y, upper_z, align1; +}; + +/* Linear axis-aligned bounding box representation */ +struct RTC_ALIGN(16) RTCLinearBounds +{ + struct RTCBounds bounds0; + struct RTCBounds bounds1; +}; + +/* Intersection context flags */ +enum RTCIntersectContextFlags +{ + RTC_INTERSECT_CONTEXT_FLAG_NONE = 0, + RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays + RTC_INTERSECT_CONTEXT_FLAG_COHERENT = (1 << 0) // optimize for coherent rays +}; + +/* Arguments for RTCFilterFunctionN */ +struct RTCFilterFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + struct 
RTCIntersectContext* context; + struct RTCRayN* ray; + struct RTCHitN* hit; + unsigned int N; +}; + +/* Filter callback function */ +typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args); + +/* Intersection context passed to intersect/occluded calls */ +struct RTCIntersectContext +{ + enum RTCIntersectContextFlags flags; // intersection flags + RTCFilterFunctionN filter; // filter function to execute + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + unsigned int instStackSize; // Number of instances currently on the stack. +#endif + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids. + +#if RTC_MIN_WIDTH + float minWidthDistanceFactor; // curve radius is set to this factor times distance to ray origin +#endif +}; + +/* Initializes an intersection context. */ +RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context) +{ + unsigned l = 0; + context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; + context->filter = NULL; + +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + context->instStackSize = 0; +#endif + for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + context->instID[l] = RTC_INVALID_GEOMETRY_ID; + +#if RTC_MIN_WIDTH + context->minWidthDistanceFactor = 0.0f; +#endif +} + +/* Point query structure for closest point query */ +struct RTC_ALIGN(16) RTCPointQuery +{ + float x; // x coordinate of the query point + float y; // y coordinate of the query point + float z; // z coordinate of the query point + float time; // time of the point query + float radius; // radius of the point query +}; + +/* Structure of a packet of 4 query points */ +struct RTC_ALIGN(16) RTCPointQuery4 +{ + float x[4]; // x coordinate of the query point + float y[4]; // y coordinate of the query point + float z[4]; // z coordinate of the query point + float time[4]; // time of the point query + float radius[4]; // radius of the point query +}; + +/* Structure of a packet of 8 query points */ +struct RTC_ALIGN(32) 
RTCPointQuery8 +{ + float x[8]; // x coordinate of the query point + float y[8]; // y coordinate of the query point + float z[8]; // z coordinate of the query point + float time[8]; // time of the point query + float radius[8]; // radius ofr the point query +}; + +/* Structure of a packet of 16 query points */ +struct RTC_ALIGN(64) RTCPointQuery16 +{ + float x[16]; // x coordinate of the query point + float y[16]; // y coordinate of the query point + float z[16]; // z coordinate of the query point + float time[16]; // time of the point quey + float radius[16]; // radius of the point query +}; + +struct RTCPointQueryN; + +struct RTC_ALIGN(16) RTCPointQueryContext +{ + // accumulated 4x4 column major matrices from world space to instance space. + // undefined if size == 0. + float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // accumulated 4x4 column major matrices from instance space to world space. + // undefined if size == 0. + float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; + + // instance ids. + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; + + // number of instances currently on the stack. + unsigned int instStackSize; +}; + +/* Initializes an intersection context. */ +RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context) +{ + context->instStackSize = 0; + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +} + +struct RTC_ALIGN(16) RTCPointQueryFunctionArguments +{ + // The (world space) query object that was passed as an argument of rtcPointQuery. The + // radius of the query can be decreased inside the callback to shrink the + // search domain. Increasing the radius or modifying the time or position of + // the query results in undefined behaviour. + struct RTCPointQuery* query; + + // Used for user input/output data. Will not be read or modified internally. 
+ void* userPtr; + + // primitive and geometry ID of primitive + unsigned int primID; + unsigned int geomID; + + // the context with transformation and instance ID stack + struct RTCPointQueryContext* context; + + // If the current instance transform M (= context->world2inst[context->instStackSize]) + // is a similarity matrix, i.e there is a constant factor similarityScale such that, + // for all x,y: dist(Mx, My) = similarityScale * dist(x, y), + // The similarity scale is 0, if the current instance transform is not a + // similarity transform and vice versa. The similarity scale allows to compute + // distance information in instance space and scale the distances into world + // space by dividing with the similarity scale, for example, to update the + // query radius. If the current instance transform is not a similarity + // transform (similarityScale = 0), the distance computation has to be + // performed in world space to ensure correctness. if there is no instance + // transform (context->instStackSize == 0), the similarity scale is 1. 
+ float similarityScale; +}; + +typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_config.h b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h new file mode 100644 index 00000000000..337d4e94872 --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h @@ -0,0 +1,57 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define RTC_VERSION_MAJOR 3 +#define RTC_VERSION_MINOR 12 +#define RTC_VERSION_PATCH 1 +#define RTC_VERSION 31201 +#define RTC_VERSION_STRING "3.12.1" + +#define RTC_MAX_INSTANCE_LEVEL_COUNT 1 + +#define EMBREE_MIN_WIDTH 0 +#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH + +#define EMBREE_STATIC_LIB +/* #undef EMBREE_API_NAMESPACE */ + +#if defined(EMBREE_API_NAMESPACE) +# define RTC_NAMESPACE +# define RTC_NAMESPACE_BEGIN namespace { +# define RTC_NAMESPACE_END } +# define RTC_NAMESPACE_USE using namespace ; +# define RTC_API_EXTERN_C +# undef EMBREE_API_NAMESPACE +#else +# define RTC_NAMESPACE_BEGIN +# define RTC_NAMESPACE_END +# define RTC_NAMESPACE_USE +# if defined(__cplusplus) +# define RTC_API_EXTERN_C extern "C" +# else +# define RTC_API_EXTERN_C +# endif +#endif + +#if defined(ISPC) +# define RTC_API_IMPORT extern "C" unmasked +# define RTC_API_EXPORT extern "C" unmasked +#elif defined(EMBREE_STATIC_LIB) +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C +#elif defined(_WIN32) +# define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport) +# define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport) +#else +# define RTC_API_IMPORT RTC_API_EXTERN_C +# define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default"))) +#endif + +#if defined(RTC_EXPORT_API) +# define RTC_API RTC_API_EXPORT +#else +# define RTC_API RTC_API_IMPORT +#endif diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_device.h 
b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h new file mode 100644 index 00000000000..594e2b755dc --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h @@ -0,0 +1,87 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque device type */ +typedef struct RTCDeviceTy* RTCDevice; + +/* Creates a new Embree device. */ +RTC_API RTCDevice rtcNewDevice(const char* config); + +/* Retains the Embree device (increments the reference count). */ +RTC_API void rtcRetainDevice(RTCDevice device); + +/* Releases an Embree device (decrements the reference count). */ +RTC_API void rtcReleaseDevice(RTCDevice device); + +/* Device properties */ +enum RTCDeviceProperty +{ + RTC_DEVICE_PROPERTY_VERSION = 0, + RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1, + RTC_DEVICE_PROPERTY_VERSION_MINOR = 2, + RTC_DEVICE_PROPERTY_VERSION_PATCH = 3, + + RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED = 32, + RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED = 33, + RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34, + RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED = 35, + + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63, + RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED = 64, + RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED = 65, + RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED = 66, + RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67, + RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED = 68, + + RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED = 96, + RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED = 97, + RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98, + RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED = 99, + RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED = 100, + RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED = 101, + + RTC_DEVICE_PROPERTY_TASKING_SYSTEM = 128, + RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129, + RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130 +}; + +/* Gets a device 
property. */ +RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop); + +/* Sets a device property. */ +RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value); + +/* Error codes */ +enum RTCError +{ + RTC_ERROR_NONE = 0, + RTC_ERROR_UNKNOWN = 1, + RTC_ERROR_INVALID_ARGUMENT = 2, + RTC_ERROR_INVALID_OPERATION = 3, + RTC_ERROR_OUT_OF_MEMORY = 4, + RTC_ERROR_UNSUPPORTED_CPU = 5, + RTC_ERROR_CANCELLED = 6 +}; + +/* Returns the error code. */ +RTC_API enum RTCError rtcGetDeviceError(RTCDevice device); + +/* Error callback function */ +typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str); + +/* Sets the error callback function. */ +RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr); + +/* Memory monitor callback function */ +typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post); + +/* Sets the memory monitor callback function. 
*/ +RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr); + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h new file mode 100644 index 00000000000..c70f1b0e5cb --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h @@ -0,0 +1,383 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_buffer.h" +#include "rtcore_quaternion.h" + +RTC_NAMESPACE_BEGIN + +/* Opaque scene type */ +typedef struct RTCSceneTy* RTCScene; + +/* Opaque geometry type */ +typedef struct RTCGeometryTy* RTCGeometry; + +/* Types of geometries */ +enum RTCGeometryType +{ + RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh + RTC_GEOMETRY_TYPE_QUAD = 1, // quad (triangle pair) mesh + RTC_GEOMETRY_TYPE_GRID = 2, // grid mesh + + RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface + + RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE = 15, // Cone linear curves - discontinuous at edge boundaries + RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE = 16, // Round (rounded cone like) linear curves + RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE = 17, // flat (ribbon-like) linear curves + + RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE = 24, // round (tube-like) Bezier curves + RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE = 25, // flat (ribbon-like) Bezier curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE = 26, // flat normal-oriented Bezier curves + + RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves + RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE = 33, // flat (ribbon-like) B-spline curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE = 34, // flat normal-oriented B-spline curves + + RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves + RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE = 41, // flat (ribbon-like) Hermite curves + 
RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE = 42, // flat normal-oriented Hermite curves + + RTC_GEOMETRY_TYPE_SPHERE_POINT = 50, + RTC_GEOMETRY_TYPE_DISC_POINT = 51, + RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52, + + RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE = 59, // flat (ribbon-like) Catmull-Rom curves + RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE = 60, // flat normal-oriented Catmull-Rom curves + + RTC_GEOMETRY_TYPE_USER = 120, // user-defined geometry + RTC_GEOMETRY_TYPE_INSTANCE = 121 // scene instance +}; + +/* Interpolation modes for subdivision surfaces */ +enum RTCSubdivisionMode +{ + RTC_SUBDIVISION_MODE_NO_BOUNDARY = 0, + RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1, + RTC_SUBDIVISION_MODE_PIN_CORNERS = 2, + RTC_SUBDIVISION_MODE_PIN_BOUNDARY = 3, + RTC_SUBDIVISION_MODE_PIN_ALL = 4, +}; + +/* Curve segment flags */ +enum RTCCurveFlags +{ + RTC_CURVE_FLAG_NEIGHBOR_LEFT = (1 << 0), // left segments exists + RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1) // right segment exists +}; + +/* Arguments for RTCBoundsFunction */ +struct RTCBoundsFunctionArguments +{ + void* geometryUserPtr; + unsigned int primID; + unsigned int timeStep; + struct RTCBounds* bounds_o; +}; + +/* Bounding callback function */ +typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args); + +/* Arguments for RTCIntersectFunctionN */ +struct RTCIntersectFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* context; + struct RTCRayHitN* rayhit; + unsigned int N; + unsigned int geomID; +}; + +/* Intersection callback function */ +typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args); + +/* Arguments for RTCOccludedFunctionN */ +struct RTCOccludedFunctionNArguments +{ + int* valid; + void* geometryUserPtr; + unsigned int primID; + struct RTCIntersectContext* context; + struct RTCRayN* 
ray; + unsigned int N; + unsigned int geomID; +}; + +/* Occlusion callback function */ +typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args); + +/* Arguments for RTCDisplacementFunctionN */ +struct RTCDisplacementFunctionNArguments +{ + void* geometryUserPtr; + RTCGeometry geometry; + unsigned int primID; + unsigned int timeStep; + const float* u; + const float* v; + const float* Ng_x; + const float* Ng_y; + const float* Ng_z; + float* P_x; + float* P_y; + float* P_z; + unsigned int N; +}; + +/* Displacement mapping callback function */ +typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args); + +/* Creates a new geometry of specified type. */ +RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type); + +/* Retains the geometry (increments the reference count). */ +RTC_API void rtcRetainGeometry(RTCGeometry geometry); + +/* Releases the geometry (decrements the reference count) */ +RTC_API void rtcReleaseGeometry(RTCGeometry geometry); + +/* Commits the geometry. */ +RTC_API void rtcCommitGeometry(RTCGeometry geometry); + + +/* Enables the geometry. */ +RTC_API void rtcEnableGeometry(RTCGeometry geometry); + +/* Disables the geometry. */ +RTC_API void rtcDisableGeometry(RTCGeometry geometry); + + +/* Sets the number of motion blur time steps of the geometry. */ +RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount); + +/* Sets the motion blur time range of the geometry. */ +RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime); + +/* Sets the number of vertex attributes of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount); + +/* Sets the ray mask of the geometry. */ +RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask); + +/* Sets the build quality of the geometry. 
*/ +RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality); + +/* Sets the maximal curve or point radius scale allowed by min-width feature. */ +RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale); + + +/* Sets a geometry buffer. */ +RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Sets a shared geometry buffer. */ +RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount); + +/* Creates and sets a new geometry buffer. */ +RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount); + +/* Returns the pointer to the data of a buffer. */ +RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + +/* Updates a geometry buffer. */ +RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot); + + +/* Sets the intersection filter callback function of the geometry. */ +RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the occlusion filter callback function of the geometry. */ +RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter); + +/* Sets the user-defined data pointer of the geometry. */ +RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr); + +/* Gets the user-defined data pointer of the geometry. */ +RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry); + +/* Set the point query callback function of a geometry. 
*/ +RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery); + +/* Sets the number of primitives of a user geometry. */ +RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount); + +/* Sets the bounding callback function to calculate bounding boxes for user primitives. */ +RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr); + +/* Set the intersect callback function of a user geometry. */ +RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect); + +/* Set the occlusion callback function of a user geometry. */ +RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded); + +/* Invokes the intersection filter from the intersection callback function. */ +RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + +/* Invokes the occlusion filter from the occlusion callback function. */ +RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs); + + +/* Sets the instanced scene of an instance geometry. */ +RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene); + +/* Sets the transformation of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm); + +/* Sets the transformation quaternion of an instance for the specified time step. */ +RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd); + +/* Returns the interpolated transformation of an instance for the specified time. 
*/ +RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm); + + +/* Sets the uniform tessellation rate of the geometry. */ +RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate); + +/* Sets the number of topologies of a subdivision surface. */ +RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount); + +/* Sets the subdivision interpolation mode. */ +RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode); + +/* Binds a vertex attribute to a topology of the geometry. */ +RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID); + +/* Sets the displacement callback function of a subdivision surface. */ +RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement); + +/* Returns the first half edge of a face. */ +RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID); + +/* Returns the face the half edge belongs to. */ +RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID); + +/* Returns next half edge. */ +RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns previous half edge. */ +RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID); + +/* Returns opposite half edge. 
*/ +RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID); + + +/* Arguments for rtcInterpolate */ +struct RTCInterpolateArguments +{ + RTCGeometry geometry; + unsigned int primID; + float u; + float v; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */ +RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args); + +/* Interpolates vertex data to some u/v location. */ +RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = NULL; + args.dPdv = NULL; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first order derivatives. 
*/ +RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = NULL; + args.ddPdvdv = NULL; + args.ddPdudv = NULL; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */ +RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount) +{ + struct RTCInterpolateArguments args; + args.geometry = geometry; + args.primID = primID; + args.u = u; + args.v = v; + args.bufferType = bufferType; + args.bufferSlot = bufferSlot; + args.P = P; + args.dPdu = dPdu; + args.dPdv = dPdv; + args.ddPdudu = ddPdudu; + args.ddPdvdv = ddPdvdv; + args.ddPdudv = ddPdudv; + args.valueCount = valueCount; + rtcInterpolate(&args); +} + +/* Arguments for rtcInterpolateN */ +struct RTCInterpolateNArguments +{ + RTCGeometry geometry; + const void* valid; + const unsigned int* primIDs; + const float* u; + const float* v; + unsigned int N; + enum RTCBufferType bufferType; + unsigned int bufferSlot; + float* P; + float* dPdu; + float* dPdv; + float* ddPdudu; + float* ddPdvdv; + float* ddPdudv; + unsigned int valueCount; +}; + +/* Interpolates vertex data to an array of u/v locations. 
*/ +RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args); + +/* RTCGrid primitive for grid mesh */ +struct RTCGrid +{ + unsigned int startVertexID; + unsigned int stride; + unsigned short width,height; // max is a 32k x 32k grid +}; + +RTC_NAMESPACE_END + + diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h new file mode 100644 index 00000000000..449cdedfdcf --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h @@ -0,0 +1,101 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* + * Structure for transformation respresentation as a matrix decomposition using + * a quaternion + */ +struct RTC_ALIGN(16) RTCQuaternionDecomposition +{ + float scale_x; + float scale_y; + float scale_z; + float skew_xy; + float skew_xz; + float skew_yz; + float shift_x; + float shift_y; + float shift_z; + float quaternion_r; + float quaternion_i; + float quaternion_j; + float quaternion_k; + float translation_x; + float translation_y; + float translation_z; +}; + +RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp) +{ + qdecomp->scale_x = 1.f; + qdecomp->scale_y = 1.f; + qdecomp->scale_z = 1.f; + qdecomp->skew_xy = 0.f; + qdecomp->skew_xz = 0.f; + qdecomp->skew_yz = 0.f; + qdecomp->shift_x = 0.f; + qdecomp->shift_y = 0.f; + qdecomp->shift_z = 0.f; + qdecomp->quaternion_r = 1.f; + qdecomp->quaternion_i = 0.f; + qdecomp->quaternion_j = 0.f; + qdecomp->quaternion_k = 0.f; + qdecomp->translation_x = 0.f; + qdecomp->translation_y = 0.f; + qdecomp->translation_z = 0.f; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion( + struct RTCQuaternionDecomposition* qdecomp, + float r, float i, float j, float k) +{ + qdecomp->quaternion_r = r; + qdecomp->quaternion_i = i; + qdecomp->quaternion_j = j; + 
qdecomp->quaternion_k = k; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale( + struct RTCQuaternionDecomposition* qdecomp, + float scale_x, float scale_y, float scale_z) +{ + qdecomp->scale_x = scale_x; + qdecomp->scale_y = scale_y; + qdecomp->scale_z = scale_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew( + struct RTCQuaternionDecomposition* qdecomp, + float skew_xy, float skew_xz, float skew_yz) +{ + qdecomp->skew_xy = skew_xy; + qdecomp->skew_xz = skew_xz; + qdecomp->skew_yz = skew_yz; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift( + struct RTCQuaternionDecomposition* qdecomp, + float shift_x, float shift_y, float shift_z) +{ + qdecomp->shift_x = shift_x; + qdecomp->shift_y = shift_y; + qdecomp->shift_z = shift_z; +} + +RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation( + struct RTCQuaternionDecomposition* qdecomp, + float translation_x, float translation_y, float translation_z) +{ + qdecomp->translation_x = translation_x; + qdecomp->translation_y = translation_y; + qdecomp->translation_z = translation_z; +} + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h new file mode 100644 index 00000000000..1ae3309ef1f --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h @@ -0,0 +1,378 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_common.h" + +RTC_NAMESPACE_BEGIN + +/* Ray structure for a single ray */ +struct RTC_ALIGN(16) RTCRay +{ + float org_x; // x coordinate of ray origin + float org_y; // y coordinate of ray origin + float org_z; // z coordinate of ray origin + float tnear; // start of ray segment + + float dir_x; // x coordinate of ray direction + float dir_y; // y coordinate of ray direction + float dir_z; // z coordinate of ray direction + float time; // time of this ray for motion blur + + float tfar; // end of 
ray segment (set to hit distance) + unsigned int mask; // ray mask + unsigned int id; // ray ID + unsigned int flags; // ray flags +}; + +/* Hit structure for a single ray */ +struct RTC_ALIGN(16) RTCHit +{ + float Ng_x; // x coordinate of geometry normal + float Ng_y; // y coordinate of geometry normal + float Ng_z; // z coordinate of geometry normal + + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID +}; + +/* Combined ray/hit structure for a single ray */ +struct RTCRayHit +{ + struct RTCRay ray; + struct RTCHit hit; +}; + +/* Ray structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCRay4 +{ + float org_x[4]; + float org_y[4]; + float org_z[4]; + float tnear[4]; + + float dir_x[4]; + float dir_y[4]; + float dir_z[4]; + float time[4]; + + float tfar[4]; + unsigned int mask[4]; + unsigned int id[4]; + unsigned int flags[4]; +}; + +/* Hit structure for a packet of 4 rays */ +struct RTC_ALIGN(16) RTCHit4 +{ + float Ng_x[4]; + float Ng_y[4]; + float Ng_z[4]; + + float u[4]; + float v[4]; + + unsigned int primID[4]; + unsigned int geomID[4]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4]; +}; + +/* Combined ray/hit structure for a packet of 4 rays */ +struct RTCRayHit4 +{ + struct RTCRay4 ray; + struct RTCHit4 hit; +}; + +/* Ray structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCRay8 +{ + float org_x[8]; + float org_y[8]; + float org_z[8]; + float tnear[8]; + + float dir_x[8]; + float dir_y[8]; + float dir_z[8]; + float time[8]; + + float tfar[8]; + unsigned int mask[8]; + unsigned int id[8]; + unsigned int flags[8]; +}; + +/* Hit structure for a packet of 8 rays */ +struct RTC_ALIGN(32) RTCHit8 +{ + float Ng_x[8]; + float Ng_y[8]; + float Ng_z[8]; + + float u[8]; + float v[8]; + + unsigned int primID[8]; + unsigned int geomID[8]; + unsigned int 
instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8]; +}; + +/* Combined ray/hit structure for a packet of 8 rays */ +struct RTCRayHit8 +{ + struct RTCRay8 ray; + struct RTCHit8 hit; +}; + +/* Ray structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCRay16 +{ + float org_x[16]; + float org_y[16]; + float org_z[16]; + float tnear[16]; + + float dir_x[16]; + float dir_y[16]; + float dir_z[16]; + float time[16]; + + float tfar[16]; + unsigned int mask[16]; + unsigned int id[16]; + unsigned int flags[16]; +}; + +/* Hit structure for a packet of 16 rays */ +struct RTC_ALIGN(64) RTCHit16 +{ + float Ng_x[16]; + float Ng_y[16]; + float Ng_z[16]; + + float u[16]; + float v[16]; + + unsigned int primID[16]; + unsigned int geomID[16]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16]; +}; + +/* Combined ray/hit structure for a packet of 16 rays */ +struct RTCRayHit16 +{ + struct RTCRay16 ray; + struct RTCHit16 hit; +}; + +/* Ray structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayNp +{ + float* org_x; + float* org_y; + float* org_z; + float* tnear; + + float* dir_x; + float* dir_y; + float* dir_z; + float* time; + + float* tfar; + unsigned int* mask; + unsigned int* id; + unsigned int* flags; +}; + +/* Hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCHitNp +{ + float* Ng_x; + float* Ng_y; + float* Ng_z; + + float* u; + float* v; + + unsigned int* primID; + unsigned int* geomID; + unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; +}; + +/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */ +struct RTCRayHitNp +{ + struct RTCRayNp ray; + struct RTCHitNp hit; +}; + +struct RTCRayN; +struct RTCHitN; +struct RTCRayHitN; + +#if defined(__cplusplus) + +/* Helper functions to access ray packets of runtime size N */ +RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int 
N, unsigned int i) { return ((float*)ray)[1*N+i]; } +RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; } +RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; } +RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; } +RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; } + +RTC_FORCEINLINE float& RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_id (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; } +RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; } + +/* Helper functions to access hit packets of runtime size N */ +RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; } +RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[2*N+i]; } + +RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; } +RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; } + +RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { 
return ((unsigned*)hit)[5*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; } +RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; } + +/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */ +RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; } +RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; } + +/* Helper structure for a ray packet of compile-time size N */ +template +struct RTCRayNt +{ + float org_x[N]; + float org_y[N]; + float org_z[N]; + float tnear[N]; + + float dir_x[N]; + float dir_y[N]; + float dir_z[N]; + float time[N]; + + float tfar[N]; + unsigned int mask[N]; + unsigned int id[N]; + unsigned int flags[N]; +}; + +/* Helper structure for a hit packet of compile-time size N */ +template +struct RTCHitNt +{ + float Ng_x[N]; + float Ng_y[N]; + float Ng_z[N]; + + float u[N]; + float v[N]; + + unsigned int primID[N]; + unsigned int geomID[N]; + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N]; +}; + +/* Helper structure for a combined ray/hit packet of compile-time size N */ +template +struct RTCRayHitNt +{ + RTCRayNt ray; + RTCHitNt hit; +}; + +RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i) +{ + RTCRay ray; + ray.org_x = RTCRayN_org_x(rayN,N,i); + ray.org_y = RTCRayN_org_y(rayN,N,i); + ray.org_z = RTCRayN_org_z(rayN,N,i); + ray.tnear = RTCRayN_tnear(rayN,N,i); + ray.dir_x = RTCRayN_dir_x(rayN,N,i); + ray.dir_y = RTCRayN_dir_y(rayN,N,i); + ray.dir_z = RTCRayN_dir_z(rayN,N,i); + ray.time = RTCRayN_time(rayN,N,i); + ray.tfar = RTCRayN_tfar(rayN,N,i); + ray.mask = RTCRayN_mask(rayN,N,i); + ray.id = RTCRayN_id(rayN,N,i); + ray.flags = RTCRayN_flags(rayN,N,i); + return ray; +} + 
+RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i) +{ + RTCHit hit; + hit.Ng_x = RTCHitN_Ng_x(hitN,N,i); + hit.Ng_y = RTCHitN_Ng_y(hitN,N,i); + hit.Ng_z = RTCHitN_Ng_z(hitN,N,i); + hit.u = RTCHitN_u(hitN,N,i); + hit.v = RTCHitN_v(hitN,N,i); + hit.primID = RTCHitN_primID(hitN,N,i); + hit.geomID = RTCHitN_geomID(hitN,N,i); + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + hit.instID[l] = RTCHitN_instID(hitN,N,i,l); + return hit; +} + +RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i) +{ + RTCHitN_Ng_x(hitN,N,i) = hit->Ng_x; + RTCHitN_Ng_y(hitN,N,i) = hit->Ng_y; + RTCHitN_Ng_z(hitN,N,i) = hit->Ng_z; + RTCHitN_u(hitN,N,i) = hit->u; + RTCHitN_v(hitN,N,i) = hit->v; + RTCHitN_primID(hitN,N,i) = hit->primID; + RTCHitN_geomID(hitN,N,i) = hit->geomID; + for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + RTCHitN_instID(hitN,N,i,l) = hit->instID[l]; +} + +RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i) +{ + RTCRayHit rh; + + RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N); + rh.ray.org_x = RTCRayN_org_x(ray,N,i); + rh.ray.org_y = RTCRayN_org_y(ray,N,i); + rh.ray.org_z = RTCRayN_org_z(ray,N,i); + rh.ray.tnear = RTCRayN_tnear(ray,N,i); + rh.ray.dir_x = RTCRayN_dir_x(ray,N,i); + rh.ray.dir_y = RTCRayN_dir_y(ray,N,i); + rh.ray.dir_z = RTCRayN_dir_z(ray,N,i); + rh.ray.time = RTCRayN_time(ray,N,i); + rh.ray.tfar = RTCRayN_tfar(ray,N,i); + rh.ray.mask = RTCRayN_mask(ray,N,i); + rh.ray.id = RTCRayN_id(ray,N,i); + rh.ray.flags = RTCRayN_flags(ray,N,i); + + RTCHitN* hit = RTCRayHitN_HitN(rayhitN,N); + rh.hit.Ng_x = RTCHitN_Ng_x(hit,N,i); + rh.hit.Ng_y = RTCHitN_Ng_y(hit,N,i); + rh.hit.Ng_z = RTCHitN_Ng_z(hit,N,i); + rh.hit.u = RTCHitN_u(hit,N,i); + rh.hit.v = RTCHitN_v(hit,N,i); + rh.hit.primID = RTCHitN_primID(hit,N,i); + rh.hit.geomID = RTCHitN_geomID(hit,N,i); + for (unsigned int l = 0; l < 
RTC_MAX_INSTANCE_LEVEL_COUNT; l++) + rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l); + + return rh; +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h new file mode 100644 index 00000000000..0cd64015937 --- /dev/null +++ b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h @@ -0,0 +1,160 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore_device.h" + +RTC_NAMESPACE_BEGIN + +/* Forward declarations for ray structures */ +struct RTCRayHit; +struct RTCRayHit4; +struct RTCRayHit8; +struct RTCRayHit16; +struct RTCRayHitNp; + +/* Scene flags */ +enum RTCSceneFlags +{ + RTC_SCENE_FLAG_NONE = 0, + RTC_SCENE_FLAG_DYNAMIC = (1 << 0), + RTC_SCENE_FLAG_COMPACT = (1 << 1), + RTC_SCENE_FLAG_ROBUST = (1 << 2), + RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3) +}; + +/* Creates a new scene. */ +RTC_API RTCScene rtcNewScene(RTCDevice device); + +/* Returns the device the scene got created in. The reference count of + * the device is incremented by this function. */ +RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene); + +/* Retains the scene (increments the reference count). */ +RTC_API void rtcRetainScene(RTCScene scene); + +/* Releases the scene (decrements the reference count). */ +RTC_API void rtcReleaseScene(RTCScene scene); + + +/* Attaches the geometry to a scene. */ +RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry); + +/* Attaches the geometry to a scene using the specified geometry ID. */ +RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID); + +/* Detaches the geometry from the scene. */ +RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID); + +/* Gets a geometry handle from the scene. */ +RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID); + + +/* Commits the scene. 
*/ +RTC_API void rtcCommitScene(RTCScene scene); + +/* Commits the scene from multiple threads. */ +RTC_API void rtcJoinCommitScene(RTCScene scene); + + +/* Progress monitor callback function */ +typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n); + +/* Sets the progress monitor callback function of the scene. */ +RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr); + +/* Sets the build quality of the scene. */ +RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality); + +/* Sets the scene flags. */ +RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags); + +/* Returns the scene flags. */ +RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene); + +/* Returns the axis-aligned bounds of the scene. */ +RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o); + +/* Returns the linear axis-aligned bounds of the scene. */ +RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o); + + +/* Perform a closest point query of the scene. */ +RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. */ +RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. */ +RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Perform a closest point query with a packet of 4 points with the scene. 
*/ +RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr); + +/* Intersects a single ray with the scene. */ +RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit); + +/* Intersects a packet of 4 rays with the scene. */ +RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit); + +/* Intersects a packet of 8 rays with the scene. */ +RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit); + +/* Intersects a packet of 16 rays with the scene. */ +RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit); + +/* Intersects a stream of M rays with the scene. */ +RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride); + +/* Intersects a stream of pointers to M rays with the scene. */ +RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M); + +/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ +RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride); + +/* Intersects a stream of M ray packets of size N in SOA format with the scene. */ +RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N); + +/* Tests a single ray for occlusion with the scene. */ +RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray); + +/* Tests a packet of 4 rays for occlusion occluded with the scene. 
*/ +RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray); + +/* Tests a packet of 8 rays for occlusion with the scene. */ +RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray); + +/* Tests a packet of 16 rays for occlusion with the scene. */ +RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray); + +/* Tests a stream of M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride); + +/* Tests a stream of pointers to M rays for occlusion with the scene. */ +RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride); + +/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */ +RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N); + +/*! collision callback */ +struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; }; +typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions); + +/*! 
Performs collision detection of two scenes */ +RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr); + +#if defined(__cplusplus) + +/* Helper for easily combining scene flags */ +inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) { + return (RTCSceneFlags)((size_t)a | (size_t)b); +} + +#endif + +RTC_NAMESPACE_END + diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h new file mode 100644 index 00000000000..755ce255fb4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h @@ -0,0 +1,411 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_strand_array.h" + +#define NUM_HAIR_OBJECT_BINS 32 + +namespace embree +{ + namespace isa + { + struct BVHBuilderHair + { + /*! settings for builder */ + struct Settings + { + /*! 
default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t finished_range_threshold; //!< finished range threshold + }; + + template + + class BuilderT + { + ALIGNED_CLASS_(16); + friend struct BVHBuilderHair; + + typedef FastAllocator::CachedAllocator Allocator; + typedef HeuristicArrayBinningSAH HeuristicBinningSAH; + typedef UnalignedHeuristicArrayBinningSAH UnalignedHeuristicBinningSAH; + typedef HeuristicStrandSplit HeuristicStrandSplitSAH; + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + static const size_t travCostAligned = 1; + static const size_t travCostUnaligned = 5; + static const size_t intCost = 6; + + BuilderT (Scene* scene, + PrimRef* prims, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + const Settings settings) + + : cfg(settings), + prims(prims), + createAlloc(createAlloc), + createAABBNode(createAABBNode), + setAABBNode(setAABBNode), + createOBBNode(createOBBNode), + setOBBNode(setOBBNode), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + reportFinishedRange(reportFinishedRange), 
+ alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const PrimInfoRange& range) + { + if (range.size() == 0) return true; + unsigned int firstGeomID = prims[range.begin()].geomID(); + for (size_t i=range.begin()+1; i cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo)) + return createLeaf(prims,pinfo,alloc); + + /* fill all children by always splitting the largest one */ + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + unsigned numChildren = 1; + children[0] = pinfo; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + __aligned(64) PrimInfoRange left, right; + if (!sameGeometry(children[bestChild])) { + alignedHeuristic.splitByGeometry(children[bestChild],left,right); + } else { + alignedHeuristic.splitFallback(children[bestChild],left,right); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* create node */ + auto node = createAABBNode(alloc); + + for (size_t i=0; i> cfg.logBlockSize; + const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds); + + /* try standard binning in aligned space */ + float alignedObjectSAH = inf; + HeuristicBinningSAH::Split alignedObjectSplit; + if (aligned) { + alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize); + alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH(); + bestSAH = 
min(alignedObjectSAH,bestSAH); + } + + /* try standard binning in unaligned space */ + UnalignedHeuristicBinningSAH::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (bestSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpace(pinfo); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace); + unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH(); + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* try splitting into two strands */ + HeuristicStrandSplitSAH::Split strandSplit; + float strandSAH = inf; + if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) { + strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize); + strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH(); + bestSAH = min(strandSAH,bestSAH); + } + + /* fallback if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) + { + alignedHeuristic.deterministic_order(pinfo); + alignedHeuristic.splitFallback(pinfo,linfo,rinfo); + } + + /* perform aligned split if this is best */ + else if (bestSAH == alignedObjectSAH) { + alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo); + } + + /* perform unaligned split if this is best */ + else if (bestSAH == unalignedObjectSAH) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo); + aligned = false; + } + + /* perform strand split if this is best */ + else if (bestSAH == strandSAH) { + strandHeuristic.split(strandSplit,pinfo,linfo,rinfo); + aligned = false; + } + + /* can never happen */ + else + assert(false); + } + + /*! 
recursive build */ + NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(pinfo.size()); + + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + + /* create leaf node */ + if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) { + alignedHeuristic.deterministic_order(pinfo); + return createLargeLeaf(depth,pinfo,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + size_t numChildren = 1; + children[0] = pinfo; + bool aligned = true; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i bestArea) { + bestArea = area(children[i].geomBounds); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + PrimInfoRange left, right; + split(children[bestChild],left,right,aligned); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + NodeRef node; + + /* create aligned node */ + if (aligned) + { + node = createAABBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range& r) { + for (size_t i=r.begin(); i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... 
continue sequentially */ + else { + for (size_t i=0; i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds); + } + } + } + + /* create unaligned node */ + else + { + node = createOBBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range& r) { + for (size_t i=r.begin(); i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds); + } + } + } + + /* reports a finished range of primrefs */ + if (unlikely(alloc_barrier)) + reportFinishedRange(pinfo); + + return node; + } + + private: + Settings cfg; + PrimRef* prims; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeFunc& createAABBNode; + const SetAABBNodeFunc& setAABBNode; + const CreateOBBNodeFunc& createOBBNode; + const SetOBBNodeFunc& setOBBNode; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + const ReportFinishedRangeFunc& reportFinishedRange; + + private: + HeuristicBinningSAH alignedHeuristic; + UnalignedHeuristicBinningSAH unalignedHeuristic; + HeuristicStrandSplitSAH strandHeuristic; + }; + + template + + static NodeRef build (const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const 
ReportFinishedRangeFunc& reportFinishedRange, + Scene* scene, + PrimRef* prims, + const PrimInfo& pinfo, + const Settings settings) + { + typedef BuilderT Builder; + + Builder builder(scene,prims,createAlloc, + createAABBNode,setAABBNode, + createOBBNode,setOBBNode, + createLeaf,progressMonitor,reportFinishedRange,settings); + + NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h new file mode 100644 index 00000000000..92be2f7e65c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h @@ -0,0 +1,501 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/builder.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderMorton + { + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + /*! settings for morton builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} + + /*! 
initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) + : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + /*! Build primitive consisting of morton code and primitive ID. */ + struct __aligned(8) BuildPrim + { + union { + struct { + unsigned int code; //!< morton code + unsigned int index; //!< i'th primitive + }; + uint64_t t; + }; + + /*! interface for radix sort */ + __forceinline operator unsigned() const { return code; } + + /*! interface for standard sort */ + __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } + }; + + /*! 
maps bounding box to morton code */ + struct MortonCodeMapping + { + static const size_t LATTICE_BITS_PER_DIM = 10; + static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; + + vfloat4 base; + vfloat4 scale; + + __forceinline MortonCodeMapping(const BBox3fa& bounds) + { + base = (vfloat4)bounds.lower; + const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; + scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); + } + + __forceinline const vint4 bin (const BBox3fa& box) const + { + const vfloat4 lower = (vfloat4)box.lower; + const vfloat4 upper = (vfloat4)box.upper; + const vfloat4 centroid = lower+upper; + return vint4((centroid-base)*scale); + } + + __forceinline unsigned int code (const BBox3fa& box) const + { + const vint4 binID = bin(box); + const unsigned int x = extract<0>(binID); + const unsigned int y = extract<1>(binID); + const unsigned int z = extract<2>(binID); + const unsigned int xyz = bitInterleave(x,y,z); + return xyz; + } + }; + +#if defined (__AVX2__) + + /*! for AVX2 there is a fast scalar bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest) {} + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + dest->index = index; + dest->code = mapping.code(b); + dest++; + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + }; + +#else + + /*! 
before AVX2 is it better to use the SSE version of bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} + + __forceinline ~MortonCodeGenerator() + { + if (slots != 0) + { + const vint4 code = bitInterleave(ax,ay,az); + for (size_t i=0; i(binID); + ay[slots] = extract<1>(binID); + az[slots] = extract<2>(binID); + ai[slots] = index; + slots++; + currentID++; + + if (slots == 4) + { + const vint4 code = bitInterleave(ax,ay,az); + vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); + vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); + slots = 0; + } + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + size_t slots; + vint4 ax, ay, az, ai; + }; + +#endif + + template< + typename ReductionTy, + typename Allocator, + typename CreateAllocator, + typename CreateNodeFunc, + typename SetNodeBoundsFunc, + typename CreateLeafFunc, + typename CalculateBounds, + typename ProgressMonitor> + + class BuilderT : private Settings + { + ALIGNED_CLASS_(16); + + public: + + BuilderT (CreateAllocator& createAllocator, + CreateNodeFunc& createNode, + SetNodeBoundsFunc& setBounds, + CreateLeafFunc& createLeaf, + CalculateBounds& calculateBounds, + ProgressMonitor& progressMonitor, + const Settings& settings) + + : Settings(settings), + createAllocator(createAllocator), + createNode(createNode), + setBounds(setBounds), + createLeaf(createLeaf), + calculateBounds(calculateBounds), + progressMonitor(progressMonitor), + morton(nullptr) {} + + ReductionTy createLargeLeaf(size_t depth, const range& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.size() <= maxLeafSize) + return createLeaf(current,alloc); + + /* fill 
all children by always splitting the largest one */ + range children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + + do { + + /* find best child with largest number of primitives */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == size_t(-1)) break; + + /*! split best child into left and right child */ + auto split = children[bestChild].split(); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = split.first; + children[numChildren+0] = split.second; + numChildren++; + + } while (numChildren < branchingFactor); + + /* create node */ + auto node = createNode(alloc,numChildren); + + /* recurse into each child */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + for (size_t i=0; i& current) const + { + /* fast path for small ranges */ + if (likely(current.size() < 1024)) + { + /*! recalculate centroid bounds */ + BBox3fa centBounds(empty); + for (size_t i=current.begin(); i& r ) { + BBox3fa centBounds = empty; + for (size_t i=r.begin(); i& r ) { + for (size_t i=r.begin(); i& current, range& left, range& right) const + { + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + unsigned int bitpos = lzcnt(code_start^code_end); + + /* if all items mapped to same morton code, then re-create new morton codes for the items */ + if (unlikely(bitpos == 32)) + { + recreateMortonCodes(current); + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + bitpos = lzcnt(code_start^code_end); + + /* if the morton code is still the same, goto fall back split */ + if (unlikely(bitpos == 32)) { + current.split(left,right); + return; + } + } + + /* split the items at the topmost different morton code bit */ + const unsigned int bitpos_diff = 
31-bitpos; + const unsigned int bitmask = 1 << bitpos_diff; + + /* find location where bit differs using binary search */ + unsigned begin = current.begin(); + unsigned end = current.end(); + while (begin + 1 != end) { + const unsigned mid = (begin+end)/2; + const unsigned bit = morton[mid].code & bitmask; + if (bit == 0) begin = mid; else end = mid; + } + unsigned center = end; +#if defined(DEBUG) + for (unsigned int i=begin; i& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAllocator(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= singleThreadThreshold) + progressMonitor(current.size()); + + /* create leaf node */ + if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) + return createLargeLeaf(depth,current,alloc); + + /* fill all children by always splitting the one with the largest surface area */ + range children[MAX_BRANCHING_FACTOR]; + split(current,children[0],children[1]); + size_t numChildren = 2; + + while (numChildren < branchingFactor) + { + /* find best child with largest number of primitives */ + int bestChild = -1; + unsigned bestItems = 0; + for (unsigned int i=0; i bestItems) { + bestItems = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + range left, right; + split(children[bestChild],left,right); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + } + + /* create leaf node if no split is possible */ + if (unlikely(numChildren == 1)) + return createLeaf(current,alloc); + + /* allocate node */ + auto node = createNode(alloc,numChildren); + + /* process top parts of tree parallel */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + if (current.size() > singleThreadThreshold) + { + /*! 
parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range& r) { + for (size_t i=r.begin(); i(0,(unsigned)numPrimitives), nullptr, true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + public: + CreateAllocator& createAllocator; + CreateNodeFunc& createNode; + SetNodeBoundsFunc& setBounds; + CreateLeafFunc& createLeaf; + CalculateBounds& calculateBounds; + ProgressMonitor& progressMonitor; + + public: + BuildPrim* morton; + }; + + + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetBoundsFunc, + typename CreateLeafFunc, + typename CalculateBoundsFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAllocator, + CreateNodeFunc createNode, + SetBoundsFunc setBounds, + CreateLeafFunc createLeaf, + CalculateBoundsFunc calculateBounds, + ProgressMonitor progressMonitor, + BuildPrim* src, + BuildPrim* tmp, + size_t numPrimitives, + const Settings& settings) + { + typedef BuilderT< + ReductionTy, + decltype(createAllocator()), + CreateAllocFunc, + CreateNodeFunc, + SetBoundsFunc, + CreateLeafFunc, + CalculateBoundsFunc, + ProgressMonitor> Builder; + + Builder builder(createAllocator, + createNode, + setBounds, + createLeaf, + calculateBounds, + progressMonitor, + settings); + + return builder.build(src,tmp,numPrimitives); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h new file mode 100644 index 00000000000..4c138dacdbd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h @@ -0,0 +1,692 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define MBLUR_NUM_TEMPORAL_BINS 2 +#define MBLUR_NUM_OBJECT_BINS 32 + +#include "../bvh/bvh.h" +#include "../common/primref_mb.h" +#include "heuristic_binning_array_aligned.h" 
+#include "heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + template + struct SharedVector + { + __forceinline SharedVector() {} + + __forceinline SharedVector(T* ptr, size_t refCount = 1) + : prims(ptr), refCount(refCount) {} + + __forceinline void incRef() { + refCount++; + } + + __forceinline void decRef() + { + if (--refCount == 0) + delete prims; + } + + T* prims; + size_t refCount; + }; + + template + struct LocalChildListT + { + typedef SharedVector> SharedPrimRefVector; + + __forceinline LocalChildListT (const BuildRecord& record) + : numChildren(1), numSharedPrimVecs(1) + { + /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ + children[0] = record; + primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); + } + + __forceinline ~LocalChildListT() + { + for (size_t i = 0; i < numChildren; i++) + primvecs[i]->decRef(); + } + + __forceinline BuildRecord& operator[] ( const size_t i ) { + return children[i]; + } + + __forceinline size_t size() const { + return numChildren; + } + + __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr> new_vector) + { + SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; + if (lrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[bestChild] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); + } + + if (rrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[numChildren] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); + } + bsharedPrimVec->decRef(); + new_vector.release(); + + children[bestChild] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + public: + array_t children; + array_t primvecs; + size_t 
numChildren; + + array_t sharedPrimVecs; + size_t numSharedPrimVecs; + }; + + template + struct RecalculatePrimRef + { + Scene* scene; + + __forceinline RecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); + const range tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); + const range tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get(prim.geomID())->linearBounds(prim.primID(), time_range); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get(prim.geomID())->linearBounds(space, prim.primID(), time_range); + } + }; + + struct VirtualRecalculatePrimRef + { + Scene* scene; + + __forceinline VirtualRecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh 
= scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); + const range tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); + const range tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); + } + }; + + struct BVHBuilderMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! 
default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), + travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), + singleThreadThreshold(1024) {} + + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + bool singleLeafTimeSegment; //!< split time to single time range + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { + return a.prims.size() < b.prims.size(); + } + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< Depth of the root of this subtree. + SetMB prims; //!< The list of primitives. 
+ }; + + struct BuildRecordSplit : public BuildRecord + { + __forceinline BuildRecordSplit () {} + + __forceinline BuildRecordSplit (size_t depth) + : BuildRecord(depth) {} + + __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit& split) + : BuildRecord(record), split(split) {} + + BinSplit split; + }; + + template< + typename NodeRef, + typename RecalculatePrimRef, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + + typedef BVHNodeRecordMB4D NodeRecordMB4D; + typedef BinSplit Split; + typedef mvector* PrimRefVector; + typedef SharedVector> SharedPrimRefVector; + typedef LocalChildListT LocalChildList; + typedef LocalChildListT LocalChildListSplit; + + public: + + BuilderT (MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitor progressMonitor, + const Settings& settings) + : cfg(settings), + heuristicObjectSplit(), + heuristicTemporalSplit(device, recalculatePrimRef), + recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + /*! 
finds the best split */ + const Split find(const SetMB& set) + { + /* first try standard object split */ + const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); + const float object_split_sah = object_split.splitSAH(); + + /* test temporal splits only when object split was bad */ + const float leaf_sah = set.leafSAH(cfg.logBlockSize); + if (object_split_sah < 0.50f*leaf_sah) + return object_split; + + /* do temporal splits only if the the time range is big enough */ + if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) + { + const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); + const float temporal_split_sah = temporal_split.splitSAH(); + + /* take temporal split if it improved SAH */ + if (temporal_split_sah < object_split_sah) + return temporal_split; + } + + return object_split; + } + + /*! array partitioning */ + __forceinline std::unique_ptr> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + /* perform object split */ + if (likely(split.data == Split::SPLIT_OBJECT)) { + heuristicObjectSplit.split(split,set,lset,rset); + } + /* perform temporal split */ + else if (likely(split.data == Split::SPLIT_TEMPORAL)) { + return heuristicTemporalSplit.split(split,set,lset,rset); + } + /* perform fallback split */ + else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { + set.deterministic_order(); + splitFallback(set,lset,rset); + } + /* split by geometry */ + else if (unlikely(split.data == Split::SPLIT_GEOMID)) { + set.deterministic_order(); + splitByGeometry(set,lset,rset); + } + else + assert(false); + + return std::unique_ptr>(); + } + + /*! 
finds the best fallback split */ + __noinline Split findFallback(const SetMB& set) + { + /* split if primitives are not from same geometry */ + if (!sameGeometry(set)) + return Split(0.0f,Split::SPLIT_GEOMID); + + /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ + if (cfg.singleLeafTimeSegment) + { + /* test if one primitive has more than one time segment in time range, if so split time */ + for (size_t i=set.begin(); i itime_range = prim.timeSegmentRange(set.time_range); + const int localTimeSegments = itime_range.size(); + assert(localTimeSegments > 0); + if (localTimeSegments > 1) { + const int icenter = (itime_range.begin() + itime_range.end())/2; + const float splitTime = prim.timeStep(icenter); + return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); + } + } + } + + /* otherwise return fallback split */ + return Split(0.0f,Split::SPLIT_FALLBACK); + } + + /*! performs fallback split */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); + } + + /*! 
checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + if (set.size() == 0) return true; + mvector& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + unsigned int firstGeomID = prims[begin].geomID(); + for (size_t i=begin+1; i 1); + + mvector& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + + PrimInfoMB left(empty); + PrimInfoMB right(empty); + unsigned int geomID = prims[begin].geomID(); + size_t center = serial_partitioning(prims.data(),begin,end,left,right, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); + + new (&lset) SetMB(left, set.prims,range(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); + } + + const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (in.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* replace already found split by fallback split */ + const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + bool hasTimeSplits = false; + NodeRecordMB4D 
values[MAX_BRANCHING_FACTOR]; + LocalChildListSplit children(current); + + do { + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecordSplit& brecord = children[bestChild]; + BuildRecordSplit lrecord(current.depth+1); + BuildRecordSplit rrecord(current.depth+1); + std::unique_ptr> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + + /* find new splits */ + lrecord.split = findFallback(lrecord.prims); + rrecord.split = findFallback(rrecord.prims); + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i p.lower || c.upper < p.upper; + } + + /* create node */ + auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); + + /* recurse into each child and perform reduction */ + LBBox3fa gbounds = empty; + for (size_t i=0; i= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + SetMB lprims,rprims; + std::unique_ptr> new_vector = split(csplit,current.prims,lprims,rprims); + bool hasTimeSplits = new_vector != nullptr; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + { + BuildRecord lrecord(lprims,current.depth+1); + BuildRecord rrecord(rprims,current.depth+1); + children.split(0,lrecord,rrecord,std::move(new_vector)); + } + + /*! 
split until node is full or SAH tells us to stop */ + while (children.size() < cfg.branchingFactor) + { + /*! find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i bestArea) { + bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + Split csplit = find(brecord.prims); + std::unique_ptr> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + } + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i p.lower || c.upper < p.upper; + } + + /* sort buildrecords for simpler shadow ray traversal */ + //std::sort(&children[0],&children[children.size()],std::greater()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! + + /*! create an inner node */ + auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); + LBBox3fa gbounds = empty; + + /* spawn tasks */ + if (unlikely(current.size() > cfg.singleThreadThreshold)) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), children.size(), [&] (const range& r) { + for (size_t i=r.begin(); i=0; i--) { + values[i] = recurse(children[i],alloc,false); + gbounds.extend(values[i].lbounds); + } + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (unlikely(hasTimeSplits)) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + /*! 
builder entry function */ + __forceinline const NodeRecordMB4D operator() (mvector& prims, const PrimInfoMB& pinfo) + { + const SetMB set(pinfo,&prims); + auto ret = recurse(BuildRecord(set,1),nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return ret; + } + + private: + Settings cfg; + HeuristicArrayBinningMB heuristicObjectSplit; + HeuristicMBlurTemporalSplit heuristicTemporalSplit; + const RecalculatePrimRef recalculatePrimRef; + const CreateAllocFunc createAlloc; + const CreateNodeFunc createNode; + const SetNodeFunc setNode; + const CreateLeafFunc createLeaf; + const ProgressMonitor progressMonitor; + }; + + template + + static const BVHNodeRecordMB4D build(mvector& prims, + const PrimInfoMB& pinfo, + MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitorFunc progressMonitor, + const Settings& settings) + { + typedef BuilderT< + NodeRef, + RecalculatePrimRef, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + SetNodeFunc, + CreateLeafFunc, + ProgressMonitorFunc> Builder; + + Builder builder(device, + recalculatePrimRef, + createAlloc, + createNode, + setNode, + createLeaf, + progressMonitor, + settings); + + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h new file mode 100644 index 00000000000..e477c313a38 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h @@ -0,0 +1,526 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_msmblur.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include 
"../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderHairMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< depth of the root of this subtree + SetMB prims; //!< the list of primitives + }; + + template + + class BuilderT + { + ALIGNED_CLASS_(16); + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + typedef BVHNodeRecordMB NodeRecordMB; + typedef BVHNodeRecordMB4D NodeRecordMB4D; + + typedef FastAllocator::CachedAllocator Allocator; + typedef LocalChildListT LocalChildList; + + typedef HeuristicMBlurTemporalSplit HeuristicTemporal; + typedef HeuristicArrayBinningMB HeuristicBinning; + typedef UnalignedHeuristicArrayBinningMB UnalignedHeuristicBinning; + + public: + + BuilderT (Scene* scene, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& 
createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + + : cfg(settings), + scene(scene), + recalculatePrimRef(recalculatePrimRef), + createAlloc(createAlloc), + createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB), + createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + unalignedHeuristic(scene), + temporalSplitHeuristic(scene->device,recalculatePrimRef) {} + + private: + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + mvector& prims = *set.prims; + unsigned int firstGeomID = prims[set.begin()].geomID(); + for (size_t i=set.begin()+1; i& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); + } + + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB linfo(empty); + PrimInfoMB rinfo(empty); + unsigned int geomID = (*set.prims)[begin].geomID(); + size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); }); + + new (&lset) SetMB(linfo,set.prims,range(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range(center,end ),set.time_range); + } + + /*! 
creates a large leaf that could be larger than supported by the BVH */ + NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split) + return createLeaf(current.prims,alloc); + + /* fill all children by always splitting the largest one */ + LocalChildList children(current); + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! 
split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!sameGeometry(children[bestChild].prims)) { + splitByGeometry(children[bestChild].prims,left.prims,right.prims); + } else { + splitFallback(children[bestChild].prims,left.prims,right.prims); + } + children.split(bestChild,left,right,std::unique_ptr>()); + + } while (children.size() < cfg.branchingFactor); + + + /* detect time_ranges that have shrunken */ + bool timesplit = false; + for (size_t i=0; i p.lower || c.upper < p.upper; + } + + /* create node */ + NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit); + + LBBox3fa bounds = empty; + for (size_t i=0; i> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit) + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const float leafSAH = current.prims.leafSAH(cfg.logBlockSize); + + /* perform standard binning in aligned space */ + HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize); + float alignedObjectSAH = alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + + /* perform standard binning in unaligned space */ + UnalignedHeuristicBinning::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (alignedObjectSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims); + const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace); + unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */ + float temporal_split_sah = 
inf; + typename HeuristicTemporal::Split temporal_split; + if (bestSAH > 0.5f*leafSAH) { + if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) { + temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize); + temporal_split_sah = temporal_split.splitSAH(); + bestSAH = min(temporal_split_sah,bestSAH); + } + } + + /* perform fallback split if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) { + current.prims.deterministic_order(); + splitFallback(current.prims,lrecord.prims,rrecord.prims); + } + /* perform aligned split if this is best */ + else if (likely(bestSAH == alignedObjectSAH)) { + alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims); + } + /* perform unaligned split if this is best */ + else if (likely(bestSAH == unalignedObjectSAH)) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims); + aligned = false; + } + /* perform temporal split if this is best */ + else if (likely(bestSAH == temporal_split_sah)) { + timesplit = true; + return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims); + } + else + assert(false); + + return std::unique_ptr>(); + } + + /*! 
recursive build */ + NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(current.size()); + + /* create leaf node */ + if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + bool aligned = true; + bool timesplit = false; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i bestArea) { + bestArea = children[i].prims.halfArea(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + std::unique_ptr> new_vector = split(children[bestChild],left,right,aligned,timesplit); + children.split(bestChild,left,right,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i p.lower || c.upper < p.upper; + } + + /* create time split node */ + if (timesplit) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... 
*/ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range& r) { + for (size_t i=r.begin(); i SINGLE_THREADED_THRESHOLD) + { + LBBox3fa cbounds[MAX_BRANCHING_FACTOR]; + parallel_for(size_t(0), children.size(), [&] (const range& r) { + for (size_t i=r.begin(); i SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range& r) { + for (size_t i=r.begin(); i& prims, const PrimInfoMB& pinfo) + { + BuildRecord record(SetMB(pinfo,&prims),1); + auto root = recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + private: + Settings cfg; + Scene* scene; + const RecalculatePrimRef& recalculatePrimRef; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeMBFunc& createAABBNodeMB; + const SetAABBNodeMBFunc& setAABBNodeMB; + const CreateOBBNodeMBFunc& createOBBNodeMB; + const SetOBBNodeMBFunc& setOBBNodeMB; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + + private: + HeuristicBinning alignedHeuristic; + UnalignedHeuristicBinning unalignedHeuristic; + HeuristicTemporal temporalSplitHeuristic; + }; + + template + + static BVHNodeRecordMB4D build (Scene* scene, mvector& prims, const PrimInfoMB& pinfo, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + { + typedef BuilderT Builder; + + Builder builder(scene,recalculatePrimRef,createAlloc, + createAABBNodeMB,setAABBNodeMB, + createOBBNodeMB,setOBBNodeMB, + createLeaf,progressMonitor,settings); + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h 
b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h new file mode 100644 index 00000000000..3f7e678a101 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h @@ -0,0 +1,669 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning_array_aligned.h" +#include "heuristic_spatial_array.h" +#include "heuristic_openmerge_array.h" + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL +# define NUM_OBJECT_BINS 16 +# define NUM_SPATIAL_BINS 16 +#else +# define NUM_OBJECT_BINS 32 +# define NUM_SPATIAL_BINS 16 +#endif + +namespace embree +{ + namespace isa + { + MAYBE_UNUSED static const float travCost = 1.0f; + MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + struct GeneralBVHBuilder + { + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + + /*! settings for SAH builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {} + + /*! 
initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast(settings.sahBlockSize)); + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; + if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + size_t primrefarrayalloc; //!< builder 
uses prim ref array to allocate nodes and leaves when a subtree of that size is finished + }; + + /*! recursive state of builder */ + template + struct BuildRecordT + { + public: + __forceinline BuildRecordT () {} + + __forceinline BuildRecordT (size_t depth) + : depth(depth), alloc_barrier(false), prims(empty) {} + + __forceinline BuildRecordT (size_t depth, const Set& prims) + : depth(depth), alloc_barrier(false), prims(prims) {} + + __forceinline BBox3fa bounds() const { return prims.geomBounds; } + + __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); } + __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); } + + __forceinline size_t size() const { return prims.size(); } + + public: + size_t depth; //!< Depth of the root of this subtree. + bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes + Set prims; //!< The list of primitives. 
+ }; + + template + struct DefaultCanCreateLeafFunc + { + __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } + }; + + template + struct DefaultCanCreateLeafSplitFunc + { + __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } + }; + + template + + class BuilderT + { + friend struct GeneralBVHBuilder; + + BuilderT (PrimRef* prims, + Heuristic& heuristic, + const CreateAllocFunc& createAlloc, + const CreateNodeFunc& createNode, + const UpdateNodeFunc& updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) : + cfg(settings), + prims(prims), + heuristic(heuristic), + createAlloc(createAlloc), + createNode(createNode), + updateNode(updateNode), + createLeaf(createLeaf), + canCreateLeaf(canCreateLeaf), + canCreateLeafSplit(canCreateLeafSplit), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims)) + return createLeaf(prims,current.prims,alloc); + + /* fill all children by always splitting the largest one */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + do { + + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i bestSize) { + bestSize = children[i].prims.size(); + bestChild = i; + } + } + if (bestChild == (size_t)-1) break; + + /*! 
split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!canCreateLeaf(prims,children[bestChild].prims)) { + canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims); + } else { + heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + heuristic.deterministic_order(current.prims); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + Set lprims,rprims; + heuristic.split(split,current.prims,lprims,rprims); + + /*! initialize child list with initial split */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + children[0] = BuildRecord(current.depth+1,lprims); + children[1] = BuildRecord(current.depth+1,rprims); + size_t numChildren = 2; + + /*! split until node is full or SAH tells us to stop */ + while (numChildren < cfg.branchingFactor) + { + /*! 
find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i bestArea) { + bestChild = i; + bestArea = halfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + auto split = heuristic.find(brecord.prims,cfg.logBlockSize); + heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims); + children[bestChild ] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i()); + + /*! create an inner node */ + auto node = createNode(children,numChildren,alloc); + + /* spawn tasks */ + if (current.size() > cfg.singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range& r) { // FIXME: no range here + for (size_t i=r.begin(); i + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + DefaultCanCreateLeafFunc, + DefaultCanCreateLeafSplitFunc, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + DefaultCanCreateLeafFunc(), + DefaultCanCreateLeafSplitFunc(), + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = 
builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + CanCreateLeafFunc, + CanCreateLeafSplitFunc, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + + /* SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedSAH + { + typedef PrimInfoRange Set; + typedef HeuristicArrayBinningSAH Heuristic; + typedef GeneralBVHBuilder::BuildRecordT BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! 
special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + } + }; + + /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */ + struct BVHBuilderBinnedFastSpatialSAH + { + typedef PrimInfoExtRange Set; + typedef Split2,SpatialBinSplit > Split; + typedef GeneralBVHBuilder::BuildRecordT BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + static const unsigned int GEOMID_MASK = 
0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + template + struct CreateLeafExt + { + __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf) + : userCreateLeaf(userCreateLeaf) {} + + // __noinline is workaround for ICC2016 compiler bug + template + __noinline ReductionTy operator() (PrimRef* prims, const range& range, Allocator alloc) const + { + for (size_t i=range.begin(); i + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + SplitPrimitiveFunc splitPrimitive, + ProgressMonitor progressMonitor, + PrimRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArraySpatialSAH Heuristic; + Heuristic heuristic(splitPrimitive,prims,pinfo); + + /* calculate total surface area */ // FIXME: this sum is not deterministic + const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range& r) -> double { + + double A = 0.0f; + for (size_t i=r.begin(); i()); + + + /* calculate maximum number of spatial splits per primitive */ + const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1; + const float f = 10.0f; + + const float invA = 1.0f / A; + parallel_for( size_t(0), pinfo.size(), [&](const range& r) { + + for (size_t i=r.begin(); i( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + CreateLeafExt(createLeaf), + progressMonitor, + settings); + } + }; + + /* Open/Merge SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedOpenMergeSAH + { + static const size_t NUM_OBJECT_BINS_HQ = 32; + typedef PrimInfoExtRange Set; + typedef BinSplit Split; + typedef GeneralBVHBuilder::BuildRecordT BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! 
special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename BuildRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename NodeOpenerFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + NodeOpenerFunc nodeOpenerFunc, + ProgressMonitor progressMonitor, + BuildRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArrayOpenMergeSAH Heuristic; + Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor); + + return GeneralBVHBuilder::build( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h new file mode 100644 index 00000000000..a4d3b68e46a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h @@ -0,0 +1,972 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! mapping into bins */ + template + struct BinMapping + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + __forceinline BinMapping(size_t N, const BBox3fa& centBounds) + { + num = min(BINS,size_t(4.0f + 0.05f*N)); + assert(num >= 1); + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! 
calculates the mapping */ + __forceinline BinMapping(const BBox3fa& centBounds) + { + num = BINS; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + template + __forceinline BinMapping(const PrimInfo& pinfo) + { + const vfloat4 eps = 1E-34f; + num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + } + + /*! returns number of bins */ + __forceinline size_t size() const { return num; } + + /*! slower but safe binning */ + __forceinline Vec3ia bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); +#if 1 + assert(i[0] >= 0 && (size_t)i[0] < num); + assert(i[1] >= 0 && (size_t)i[1] < num); + assert(i[2] >= 0 && (size_t)i[2] < num); + return Vec3ia(i); +#else + return Vec3ia(clamp(i,vint4(0),vint4(num-1))); +#endif + } + + /*! faster but unsafe binning */ + __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { + return Vec3ia(floori((vfloat4(p)-ofs)*scale)); + } + + /*! faster but unsafe binning */ + template + __forceinline Vec3ia bin_unsafe(const PrimRef& p) const { + return bin_unsafe(p.binCenter()); + } + + /*! faster but unsafe binning */ + template + __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const { + return bin_unsafe(binBoundsAndCenter.binCenter(p)); + } + + template + __forceinline bool bin_unsafe(const PrimRef& ref, + const vint4& vSplitPos, + const vbool4& splitDimMask) const // FIXME: rename to isLeft + { + return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask); + } + /*! 
calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),1.0f / scale[dim],ofs[dim]); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) { + return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}"; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + }; + + /*! stores all information to perform some split */ + template + struct BinSplit + { + enum + { + SPLIT_OBJECT = 0, + SPLIT_FALLBACK = 1, + SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already + SPLIT_TEMPORAL = 2, + SPLIT_GEOMID = 3, + }; + + /*! construct an invalid split by default */ + __forceinline BinSplit() + : sah(inf), dim(-1), pos(0), data(0) {} + + __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) + : sah(sah), dim(dim), fpos(fpos), data(data) {} + + /*! constructs specified split */ + __forceinline BinSplit(float sah, int dim, int pos, const BinMapping& mapping) + : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! 
stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) { + return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + union { int pos; float fpos; }; //!< bin index for splitting + unsigned int data; //!< extra optional split data + BinMapping mapping; //!< mapping into bins + }; + + /*! stores extended information about the split */ + template + struct SplitInfoT + { + + __forceinline SplitInfoT () {} + + __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds) + : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {} + + public: + size_t leftCount,rightCount; + BBox leftBounds,rightBounds; + }; + + typedef SplitInfoT SplitInfo; + typedef SplitInfoT SplitInfo2; + + /*! stores all binning information */ + template + struct __aligned(64) BinInfoT + { + typedef BinSplit Split; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! bin access function */ + __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; } + __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; } + + __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; } + __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; } + + __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; } + __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; } + + /*! 
clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i& mapping) + { + if (unlikely(N == 0)) return; + size_t i; + for (i=0; i(bin0); bounds(b00,0).extend(prim0); + const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); + const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); + const unsigned int s0 = (unsigned int)prims[i+0].size(); + counts(b00,0)+=s0; + counts(b01,1)+=s0; + counts(b02,2)+=s0; + + /*! increase bounds of bins for odd primitive */ + const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1); + const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1); + const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1); + const unsigned int s1 = (unsigned int)prims[i+1].size(); + counts(b10,0)+=s1; + counts(b11,1)+=s1; + counts(b12,2)+=s1; + } + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = (unsigned int)prims[i].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + /*! bins an array of primitives */ + template + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (N == 0) return; + + size_t i; + for (i=0; i(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + + /*! 
increase bounds of bins for odd primitive */ + const unsigned int s1 = prims[i+1].size(); + const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1); + const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1); + const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1); + } + + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping& mapping) { + bin(prims+begin,end-begin,mapping); + } + + template + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) { + bin(prims+begin,end-begin,mapping,binBoundsAndCenter); + } + + /*! 
merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + + for (size_t i=0; i& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty; + for (size_t i=mapping.size()-1; i>0; i--) + { + count += counts(i); + rCounts[i] = count; + bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx); + by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by); + bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz); + rAreas[i][3] = 0.0f; + } + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions. + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + + vbestPos = select(sah < vbestSAH,ii ,vbestPos); + vbestSAH = select(sah < vbestSAH,sah,vbestSAH); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + } + } + return Split(bestSAH,bestDim,bestPos,mapping); + } + + /*! 
calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping& mapping, const Split& split, SplitInfoT& info) const + { + if (split.dim == -1) { + new (&info) SplitInfoT(0,empty,0,empty); + return; + } + + size_t leftCount = 0; + BBox leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += counts(i,split.dim); + leftBounds.extend(bounds(i,split.dim)); + } + size_t rightCount = 0; + BBox rightBounds = empty; + for (size_t i=split.pos; i(leftCount,leftBounds,rightCount,rightBounds); + } + + /*! gets the number of primitives left of the split */ + __forceinline size_t getLeftCount(const BinMapping& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += counts(i, split.dim); + } + return leftCount; + } + + /*! gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i + struct BinMapping<16> + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + template + __forceinline BinMapping(const PrimInfo& pinfo) + { + num = 16; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + scale16 = scale; + ofs16 = ofs; + } + + /*! 
returns number of bins */ + __forceinline size_t size() const { return num; } + + __forceinline vint16 bin16(const Vec3fa& p) const { + return vint16(vint4(floori((vfloat4(p)-ofs)*scale))); + } + + __forceinline vint16 bin16(const vfloat16& p) const { + return floori((p-ofs16)*scale16); + } + + __forceinline int bin_unsafe(const PrimRef& ref, + const vint16& vSplitPos, + const vbool16& splitDimMask) const // FIXME: rename to isLeft + { + const vfloat16 lower(*(vfloat4*)&ref.lower); + const vfloat16 upper(*(vfloat4*)&ref.upper); + const vfloat16 p = lower + upper; + const vint16 i = floori((p-ofs16)*scale16); + return lt(splitDimMask,i,vSplitPos); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + vfloat16 ofs16,scale16; //!< linear function that maps to bin ID + }; + + /* 16 bins in-register binner */ + template + struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa> + { + typedef BinSplit<16> Split; + typedef vbool16 vbool; + typedef vint16 vint; + typedef vfloat16 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! 
clears the bin info */ + __forceinline void clear() + { + lower[0] = lower[1] = lower[2] = pos_inf; + upper[0] = upper[1] = upper[2] = neg_inf; + count[0] = count[1] = count[2] = 0; + } + + + static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x, + const vfloat16 min_y, + const vfloat16 min_z, + const vfloat16 max_x, + const vfloat16 max_y, + const vfloat16 max_z) + { + const vfloat16 r_min_x = reverse_prefix_min(min_x); + const vfloat16 r_min_y = reverse_prefix_min(min_y); + const vfloat16 r_min_z = reverse_prefix_min(min_z); + const vfloat16 r_max_x = reverse_prefix_max(max_x); + const vfloat16 r_max_y = reverse_prefix_max(max_y); + const vfloat16 r_max_z = reverse_prefix_max(max_z); + const vfloat16 dx = r_max_x - r_min_x; + const vfloat16 dy = r_max_y - r_min_y; + const vfloat16 dz = r_max_z - r_min_z; + const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz)); + return area_rl; + } + + static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x, + const vfloat16 min_y, + const vfloat16 min_z, + const vfloat16 max_x, + const vfloat16 max_y, + const vfloat16 max_z) + { + const vfloat16 r_min_x = prefix_min(min_x); + const vfloat16 r_min_y = prefix_min(min_y); + const vfloat16 r_min_z = prefix_min(min_z); + const vfloat16 r_max_x = prefix_max(max_x); + const vfloat16 r_max_y = prefix_max(max_y); + const vfloat16 r_max_z = prefix_max(max_z); + const vfloat16 dx = r_max_x - r_min_x; + const vfloat16 dy = r_max_y - r_min_y; + const vfloat16 dz = r_max_z - r_min_z; + const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz)); + return area_lr; + } + + + /*! 
bins an array of primitives */ + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping) + { + if (unlikely(N == 0)) return; + + const vfloat16 init_min(pos_inf); + const vfloat16 init_max(neg_inf); + + vfloat16 min_x0,min_x1,min_x2; + vfloat16 min_y0,min_y1,min_y2; + vfloat16 min_z0,min_z1,min_z2; + vfloat16 max_x0,max_x1,max_x2; + vfloat16 max_y0,max_y1,max_y2; + vfloat16 max_z0,max_z1,max_z2; + vuint16 count0,count1,count2; + + min_x0 = init_min; + min_x1 = init_min; + min_x2 = init_min; + min_y0 = init_min; + min_y1 = init_min; + min_y2 = init_min; + min_z0 = init_min; + min_z1 = init_min; + min_z2 = init_min; + + max_x0 = init_max; + max_x1 = init_max; + max_x2 = init_max; + max_y0 = init_max; + max_y1 = init_max; + max_y2 = init_max; + max_z0 = init_max; + max_z1 = init_max; + max_z2 = init_max; + + count0 = zero; + count1 = zero; + count2 = zero; + + const vint16 step16(step); + size_t i; + for (i=0; i(binA); + const vint16 bin1 = shuffle<1>(binA); + const vint16 bin2 = shuffle<2>(binA); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // 
------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + + /* B */ + { + const vfloat16 b_min_x = prims[i+1].lower.x; + const vfloat16 b_min_y = prims[i+1].lower.y; + const vfloat16 b_min_z = prims[i+1].lower.z; + const vfloat16 b_max_x = prims[i+1].upper.x; + const vfloat16 b_max_y = prims[i+1].upper.y; + const vfloat16 b_max_z = prims[i+1].upper.z; + + const vint16 bin0 = shuffle<0>(binB); + const vint16 bin1 = shuffle<1>(binB); + const vint16 bin2 = shuffle<2>(binB); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = 
mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // ------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + } + + if (i < N) + { + const BBox3fa prim0 = prims[i].bounds(); + const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); + const vint16 bin = mapping.bin16(center0); + + const vfloat16 b_min_x = prims[i].lower.x; + const vfloat16 b_min_y = prims[i].lower.y; + const vfloat16 b_min_z = prims[i].lower.z; + const vfloat16 b_max_x = prims[i].upper.x; + const vfloat16 b_max_y = prims[i].upper.y; + const vfloat16 b_max_z = prims[i].upper.z; + + const vint16 bin0 = shuffle<0>(bin); + const vint16 bin1 = shuffle<1>(bin); + const vint16 bin2 = shuffle<2>(bin); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = 
step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // ------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + lower[0] = Vec3vf16( min_x0, min_y0, min_z0 ); + lower[1] = Vec3vf16( min_x1, min_y1, min_z1 ); + lower[2] = Vec3vf16( min_x2, min_y2, min_z2 ); + + upper[0] = 
Vec3vf16( max_x0, max_y0, max_z0 ); + upper[1] = Vec3vf16( max_x1, max_y1, max_z1 ); + upper[2] = Vec3vf16( max_x2, max_y2, max_z2 ); + + count[0] = count0; + count[1] = count1; + count[2] = count2; + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) { + bin(prims+begin,end-begin,mapping); + } + + /*! merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + for (size_t i=0; i<3; i++) + { + lower[i] = min(lower[i],other.lower[i]); + upper[i] = max(upper[i],other.upper[i]); + count[i] += other.count[i]; + } + } + + /*! reducesr binning information */ + static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b) + { + BinInfoT c; + for (size_t i=0; i<3; i++) + { + c.counts[i] = a.counts[i] + b.counts[i]; + c.lower[i] = min(a.lower[i],b.lower[i]); + c.upper[i] = max(a.upper[i],b.upper[i]); + } + return c; + } + + /*! finds the best split by scanning binning information */ + __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const + { + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + const vuint16 blocks_add = (1 << blocks_shift)-1; + const vfloat16 inf(pos_inf); + for (size_t dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); + const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); + const vuint16 lCount16 = prefix_sum(count[dim]); + const vuint16 rCount16 = reverse_prefix_sum(count[dim]); + + /* compute best split in this dimension */ + const vfloat16 leftArea = lArea16; + const vfloat16 rightArea = align_shift_right<1>(zero,rArea16); + const vuint16 lC = lCount16; + const vuint16 rC = 
align_shift_right<1>(zero,rCount16); + const vuint16 leftCount = ( lC + blocks_add) >> blocks_shift; + const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift; + const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries + const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf)); + /* test if this is a better dimension */ + if (any(sah < vfloat16(bestSAH))) + { + const size_t index = select_min(sah); + assert(index < 15); + assert(sah[index] < bestSAH); + bestDim = dim; + bestPos = index+1; + bestSAH = sah[index]; + } + } + + return Split(bestSAH,bestDim,bestPos,mapping); + + } + + /*! calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const + { + if (split.dim == -1) { + new (&info) SplitInfo(0,empty,0,empty); + return; + } + // FIXME: horizontal reduction! + + size_t leftCount = 0; + BBox3fa leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += count[split.dim][i]; + Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]); + Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]); + leftBounds.extend(BBox3fa(bounds_lower,bounds_upper)); + } + size_t rightCount = 0; + BBox3fa rightBounds = empty; + for (size_t i=split.pos; i& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += count[split.dim][i]; + } + return leftCount; + } + + /*! 
gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> 
BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h new file mode 100644 index 00000000000..a4c272f0151 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h @@ -0,0 +1,205 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + struct PrimInfoRange : public CentGeomBBox3fa, public range + { + __forceinline PrimInfoRange () { + } + + __forceinline PrimInfoRange(const PrimInfo& pinfo) + : CentGeomBBox3fa(pinfo), range(pinfo.begin,pinfo.end) {} + + __forceinline PrimInfoRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), range(0,0) {} + + __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), range(begin,end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); + } + }; + + /*! 
Performs standard object binning */ + template + struct HeuristicArrayBinningSAH + { + typedef BinSplit Split; + typedef BinInfoT Binner; + typedef range Set; + +#if defined(__AVX512ER__) // KNL + static const size_t PARALLEL_THRESHOLD = 4*768; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768; +#else + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#endif + __forceinline HeuristicArrayBinningSAH () + : prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayBinningSAH (PrimRef* prims) + : prims(prims) {} + + /*! finds the best split */ + __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + return find_template(pinfo,logBlockSize); + else + return find_template(pinfo,logBlockSize); + } + + template + __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping mapping(pinfo); + bin_serial_or_parallel(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping); + return binner.best(mapping,logBlockSize); + } + + /*! 
array partitioning */ + __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + split_template(split,pinfo,linfo,rinfo); + else + split_template(split,pinfo,linfo,rinfo); + } + + template + __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename Binner::vint vSplitPos(splitPos); + const typename Binner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + size_t center = 0; + if (!parallel) + center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning( + prims,begin,end,EmptyTy(),local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const PrimInfoRange& pinfo) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]); + } + + void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& 
linfo, PrimInfoRange& rinfo) + { + const size_t begin = pinfo.begin(); + const size_t end = pinfo.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + assert(range.size() > 1); + CentGeomBBox3fa left(empty); + CentGeomBBox3fa right(empty); + unsigned int geomID = prims[range.begin()].geomID(); + size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right, + [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; }, + [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); }); + + new (&linfo) PrimInfoRange(range.begin(),center,left); + new (&rinfo) PrimInfoRange(center,range.end(),right); + } + + private: + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template + struct HeuristicArrayBinningMB + { + typedef BinSplit Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT ObjectBinner; + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + ObjectBinner binner(empty); + const BinMapping mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! 
array partitioning */ + __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left, set.prims,range(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h new file mode 100644 index 00000000000..13702445862 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h @@ -0,0 +1,302 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template + struct UnalignedHeuristicArrayBinningSAH + { + typedef BinSplit Split; + typedef BinInfoT Binner; + typedef range Set; + + __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! 
remember prim array */ + __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + const LinearSpace3fa computeAlignedSpace(const range& set) + { + Vec3fa axis(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i= bestGeomPrimID) continue; + const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID); + if (sqr_length(axis1) > 1E-18f) { + axis = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + return frame(axis).transposed(); + } + + const PrimInfo computePrimInfo(const range& set, const LinearSpace3fa& space) + { + auto computeBounds = [&](const range& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t i=r.begin(); iget(prims[i].geomID()); + bounds.extend(mesh->vbounds(space,prims[i].primID())); + } + return bounds; + }; + + const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), + CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + return PrimInfo(set.begin(),set.end(),bounds); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space) + : scene(scene), space(space) {} + + /*! returns center for binning */ + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + return embree::center2(bounds); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + bounds_o = bounds; + center_o = embree::center2(bounds); + } + + private: + Scene* scene; + const LinearSpace3fa space; + }; + + /*! 
finds the best split */ + __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space) + { + if (likely(pinfo.size() < 10000)) + return find_template(pinfo,logBlockSize,space); + else + return find_template(pinfo,logBlockSize,space); + } + + /*! finds the best split */ + template + const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + Binner binner(empty); + const BinMapping mapping(set); + BinBoundsAndCenter binBoundsAndCenter(scene,space); + bin_serial_or_parallel(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (likely(set.size() < 10000)) + split_template(split,space,set,lset,rset); + else + split_template(split,space,set,lset,rset); + } + + /*! 
array partitioning */ + template + __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const int splitPos = split.pos; + const int splitDim = split.dim; + BinBoundsAndCenter binBoundsAndCenter(scene,space); + + size_t center = 0; + if (likely(set.size() < 10000)) + center = serial_partitioning(prims,begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + 128); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const range& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const range& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i + struct UnalignedHeuristicArrayBinningMB + { + typedef BinSplit Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT 
ObjectBinner; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + UnalignedHeuristicArrayBinningMB(Scene* scene) + : scene(scene) {} + + const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i= bestGeomPrimID) continue; + + const Geometry* mesh = scene->get(geomID); + const range tbounds = mesh->timeSegmentRange(set.time_range); + if (tbounds.size() == 0) continue; + + const size_t t = (tbounds.begin()+tbounds.end())/2; + const Vec3fa axis1 = mesh->computeDirection(primID,t); + if (sqr_length(axis1) > 1E-18f) { + axis0 = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + + return frame(axis0).transposed(); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space) + : scene(scene), time_range(time_range), space(space) {} + + /*! returns center for binning */ + template + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds.interpolate(0.5f); + center_o = center2(bounds_o); + } + + /*! 
returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds; + center_o = center2(lbounds.interpolate(0.5f)); + } + + private: + Scene* scene; + BBox1f time_range; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + ObjectBinner binner(empty); + const BinMapping mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! 
array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left,set.prims,range(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range(center,end ),set.time_range); + } + + private: + Scene* scene; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h new file mode 100644 index 00000000000..21f18c0208f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h @@ -0,0 +1,443 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +// TODO: +// - adjust parallel build thresholds +// - openNodesBasedOnExtend should consider max extended size + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +/* stop opening of all bref.geomIDs are the same */ +#define EQUAL_GEOMID_STOP_CRITERIA 1 + +/* 10% spatial extend threshold */ +#define MAX_EXTEND_THRESHOLD 0.1f + +/* maximum is 8 children */ +#define MAX_OPENED_CHILD_NODES 8 + +/* open until all build refs are below threshold size in one 
step */ +#define USE_LOOP_OPENING 0 + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template + struct HeuristicArrayOpenMergeSAH + { + typedef BinSplit Split; + typedef BinInfoT Binner; + + static const size_t PARALLEL_THRESHOLD = 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 512; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 128; + + __forceinline HeuristicArrayOpenMergeSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) + : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) + { + assert(max_open_size <= MAX_OPENED_CHILD_NODES); + } + + struct OpenHeuristic + { + __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) + { + const Vec3fa diag = pinfo.geomBounds.size(); + dim = maxDim(diag); + assert(diag[dim] > 0.0f); + inv_max_extend = 1.0f / diag[dim]; + } + + __forceinline bool operator () ( PrimRef& prim ) const { + return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD; + } + + private: + size_t dim; + float inv_max_extend; + }; + + /*! compute extended ranges */ + __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! 
move ranges */ + __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range& r) { + for (size_t i=r.begin(); i& r) { + for (size_t i=r.begin(); i getProperties(const PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const unsigned int geomID = prims0[set.begin()].geomID(); + + auto body = [&] (const range& r) -> std::pair { + bool commonGeomID = true; + size_t opens = 0; + for (size_t i=r.begin(); i(opens,commonGeomID); + }; + auto reduction = [&] (const std::pair& b0, const std::pair& b1) -> std::pair { + return std::pair(b0.first+b1.first,b0.second && b1.second); + }; + return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair(0,true),body,reduction); + } + + // FIXME: should consider maximum available extended size + __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const size_t ext_range_start = set.end(); + + if (false && set.size() < PARALLEL_THRESHOLD) + { + size_t extra_elements = 0; + for (size_t i=set.begin(); i ext_elements; + ext_elements.store(0); + PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range& r) -> PrimInfo { + PrimInfo info(empty); + for (size_t i=r.begin(); i 0); + + if (unlikely(next_iteration_extra_elements == 0)) break; + } + } + + __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize) + { + /* single element */ + if (set.size() 
<= 1) + return Split(); + + /* disable opening if there is no overlap */ + const size_t D = 4; + if (unlikely(set.has_ext_range() && set.size() <= D)) + { + bool disjoint = true; + for (size_t j=set.begin(); j p(0,false); + + /* disable opening when all primitives are from same geometry */ + if (unlikely(set.has_ext_range())) + { + p = getProperties(set); +#if EQUAL_GEOMID_STOP_CRITERIA == 1 + if (p.second) set.set_ext_range(set.end()); /* disable opening */ +#endif + } + + /* open nodes when we have sufficient space available */ + if (unlikely(set.has_ext_range())) + { +#if USE_LOOP_OPENING == 1 + openNodesBasedOnExtendLoop(set,p.first); +#else + if (p.first <= set.ext_range_size()) + openNodesBasedOnExtend(set); +#endif + + /* disable opening when unsufficient space for opening a node available */ + if (set.ext_range_size() < max_open_size-1) + set.set_ext_range(set.end()); /* disable opening */ + } + + /* find best split */ + return object_find(set,logBlockSize); + } + + + /*! finds the best object split */ + __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize); + else return parallel_object_find (set,logBlockSize); + } + + /*! finds the best object split */ + __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping mapping(set.centBounds); + binner.bin(prims0,set.begin(),set.end(),mapping); + return binner.best(mapping,logBlockSize); + } + + /*! 
finds the best split */ + __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping mapping(set.centBounds); + const BinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround + auto body = [&] (const range& r) -> Binner { + Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; + }; + auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { + Binner r = b0; r.merge(b1,_mapping.size()); return r; + }; + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + splitFallback(set,lset,rset); + return; + } + + std::pair ext_weights(0,0); + + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split,set,lset,rset); + else + ext_weights = parallel_object_split(split,set,lset,rset); + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! 
array partitioning */ + std::pair sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }); + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair(local_left.size(),local_right.size()); + } + + /*! 
array partitioning */ + __noinline std::pair parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }, + [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + + return std::pair(left.size(),right.size()); + } + + void deterministic_order(const extended_range& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i + struct SpatialBinMapping + { + public: + __forceinline SpatialBinMapping() {} + + /*! 
calculates the mapping */ + __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo) + { + const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower; + const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper; + const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); + const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size()); + scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); + ofs = (vfloat4) pinfo.geomBounds.lower; + inv_scale = 1.0f / scale; + } + + /*! slower but safe binning */ + __forceinline vint4 bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); + return clamp(i,vint4(0),vint4(BINS-1)); + } + + __forceinline std::pair bin(const BBox3fa& b) const + { +#if defined(__AVX__) + const vfloat8 ofs8(ofs); + const vfloat8 scale8(scale); + const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8); + const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1)); + return std::pair(extract4<0>(c_lu),extract4<1>(c_lu)); +#else + const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale); + const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale); + const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1)); + const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1)); + return std::pair(c_lower,c_upper); +#endif + } + + + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),inv_scale[dim],ofs[dim]); + } + + /*! calculates left spatial position of bin */ + template + __forceinline vfloat posN(const vfloat bin, const size_t dim) const { + return madd(bin,vfloat(inv_scale[dim]),vfloat(ofs[dim])); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID + }; + + /*! 
stores all information required to perform some split */ + template + struct SpatialBinSplit + { + /*! construct an invalid split by default */ + __forceinline SpatialBinSplit() + : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping& mapping) + : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping& mapping) + : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) { + return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + int pos; //!< split position + int left; //!< number of elements on the left side + int right; //!< number of elements on the right side + float factor; //!< factor splitting the extended range + SpatialBinMapping mapping; //!< mapping into bins + }; + + /*! stores all binning information */ + template + struct __aligned(64) SpatialBinInfo + { + SpatialBinInfo() { + } + + __forceinline SpatialBinInfo(EmptyTy) { + clear(); + } + + /*! 
clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i + __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping& mapping) + { + for (size_t i=0; i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (unlikely(splits == 1)) + { + const vint4 bin = mapping.bin(center(prim.bounds())); + for (size_t dim=0; dim<3; dim++) + { + assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); + numBegin[bin[dim]][dim]++; + numEnd [bin[dim]][dim]++; + bounds [bin[dim]][dim].extend(prim.bounds()); + } + } + else + { + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + size_t bin; + PrimRef rest = prim; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + numBegin[l][dim]++; + numEnd [l][dim]++; + bounds [l][dim].extend(prim.bounds()); + continue; + } + + for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + { + const float pos = mapping.pos(bin+1,dim); + + PrimRef left,right; + splitPrimitive(rest,(int)dim,pos,left,right); + if (unlikely(left.bounds().empty())) l++; + bounds[bin][dim].extend(left.bounds()); + rest = right; + } + if (unlikely(rest.bounds().empty())) r--; + numBegin[l][dim]++; + numEnd [r][dim]++; + bounds [bin][dim].extend(rest.bounds()); + } + } + } + } + + /*! bins a range of primitives inside an array */ + template + void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping& mapping) { + bin(splitPrimitive,prims+begin,end-begin,mapping); + } + + /*! 
bins an array of primitives */ + template + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping& mapping) + { + for (size_t i=begin; i& mapping) + { + for (size_t i=begin; i best(const SpatialBinMapping& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty; + for (size_t i=BINS-1; i>0; i--) + { + count += numEnd[i]; + rCounts[i] = count; + bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx); + by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by); + bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz); + rAreas[i][3] = 0.0f; + } + + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i> (unsigned int)(blocks_shift); + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + const vbool4 mask = sah < vbestSAH; + vbestPos = select(mask,ii ,vbestPos); + vbestSAH = select(mask,sah,vbestSAH); + vbestlCount = select(mask,count,vbestlCount); + vbestrCount = select(mask,rCounts[i],vbestrCount); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + unsigned int bestlCount = 0; + unsigned int bestrCount = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; 
+ bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + bestlCount = vbestlCount[dim]; + bestrCount = vbestrCount[dim]; + } + } + assert(bestSAH >= 0.0f); + + /* return invalid split if no split found */ + if (bestDim == -1) + return SpatialBinSplit(inf,-1,0,mapping); + + /* return best found split */ + return SpatialBinSplit(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping); + } + + private: + BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 numBegin[BINS]; //!< number of primitives starting in bin + vuint4 numEnd[BINS]; //!< number of primitives ending in bin + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h new file mode 100644 index 00000000000..911dcf950ca --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h @@ -0,0 +1,552 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +namespace embree +{ + namespace isa + { +#if 0 +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f +#else +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f +#endif + + struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range + { + __forceinline PrimInfoExtRange() { + } + + __forceinline PrimInfoExtRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), extended_range(0,0,0) {} + + __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), extended_range(begin,end,ext_end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float 
leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); + } + }; + + template + struct Split2 + { + __forceinline Split2 () {} + + __forceinline Split2 (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + } + + __forceinline Split2& operator= (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + return *this; + } + + __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; } + __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; } + + __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; } + __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; } + + __forceinline Split2 (const ObjectSplit& objectSplit, float sah) + : spatial(false), sah(sah) + { + new (data) ObjectSplit(objectSplit); + } + + __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) + : spatial(true), sah(sah) + { + new (data) SpatialSplit(spatialSplit); + } + + __forceinline float splitSAH() const { + return sah; + } + + __forceinline bool valid() const { + return sah < float(inf); + } + + public: + __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; + bool spatial; + float sah; + }; + + /*! 
Performs standard object binning */ + template + struct HeuristicArraySpatialSAH + { + typedef BinSplit ObjectSplit; + typedef BinInfoT ObjectBinner; + + typedef SpatialBinSplit SpatialSplit; + typedef SpatialBinInfo SpatialBinner; + + //typedef extended_range Set; + typedef Split2 Split; + +#if defined(__AVX512ER__) // KNL + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#else + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#endif + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 64; + + __forceinline HeuristicArraySpatialSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info) + : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {} + + + /*! compute extended ranges */ + __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! 
move ranges */ + __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range& r) { + for (size_t i=r.begin(); i& r) { + for (size_t i=r.begin(); i= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) && + safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds)) + { + const SpatialSplit spatial_split = spatial_find(set, logBlockSize); + const float spatial_split_sah = spatial_split.splitSAH(); + + /* valid spatial split, better SAH and number of splits do not exceed extended range */ + if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah && + spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size()) + { + return Split(spatial_split,spatial_split_sah); + } + } + } + + return Split(object_split,object_split_sah); + } + + /*! finds the best object split */ + __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info); + else return parallel_object_find (set,logBlockSize,info); + } + + /*! 
finds the best object split */ + __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping mapping(set); + binner.bin(prims0,set.begin(),set.end(),mapping); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best split */ + __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping mapping(set); + const BinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; }, + [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; }); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best spatial split */ + __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize); + else return parallel_spatial_find (set, logBlockSize); + } + + /*! 
finds the best spatial split */ + __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping mapping(set); + binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping mapping(set); + const SpatialBinMapping& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range& r) -> SpatialBinner { + SpatialBinner binner(empty); + binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); + return binner; }, + [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + + /*! 
subdivides primitives based on a spatial split */ + __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping &mapping) + { + assert(set.has_ext_range()); + const size_t max_ext_range_size = set.ext_range_size(); + const size_t ext_range_start = set.end(); + + /* atomic counter for number of primref splits */ + std::atomic ext_elements; + ext_elements.store(0); + + const float fpos = split.mapping.pos(split.pos,split.dim); + + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + + parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range& r) { + for (size_t i=r.begin();i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */ + + //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; + //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; + //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) + { + assert(splits > 1); + + PrimRef left,right; + const auto splitter = splitterFactory(prims0[i]); + splitter(prims0[i],split.dim,fpos,left,right); + + // no empty splits + if (unlikely(left.bounds().empty() || right.bounds().empty())) continue; + + left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + + const size_t ID = ext_elements.fetch_add(1); + + /* break if the number of subdivided elements are greater than the maximum allowed size */ + if (unlikely(ID >= max_ext_range_size)) + break; + + /* only write within the correct bounds */ + assert(ID < max_ext_range_size); + prims0[i] = left; + prims0[ext_range_start+ID] = right; + } + } + }); + + const size_t numExtElements = min(max_ext_range_size,ext_elements.load()); + 
assert(set.end()+numExtElements<=set.ext_end()); + set._end += numExtElements; + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + std::pair ext_weights(0,0); + + if (unlikely(split.spatial)) + { + create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); + + /* spatial split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset); + else + ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset); + } + else + { + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset); + else + ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset); + } + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! 
array partitioning */ + std::pair sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair(left_weight,right_weight); + } + + + /*! 
array partitioning */ + __noinline std::pair sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping &mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair(left_weight,right_weight); + } + + + + /*! 
array partitioning */ + __noinline std::pair parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair(left_weight,right_weight); + } + + /*! 
array partitioning */ + __noinline std::pair parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping& mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + auto isLeft = [&] (const PrimRef &ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair(left_weight,right_weight); + } + + void deterministic_order(const PrimInfoExtRange& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + void splitFallback(const PrimInfoExtRange& set, + PrimInfoExtRange& lset, + PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i> 
(32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t rweight = right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const PrimitiveSplitterFactory& splitterFactory; + const CentGeomBBox3fa& root_info; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h new file mode 100644 index 00000000000..ede0d04c78a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h @@ -0,0 +1,188 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + struct HeuristicStrandSplit + { + typedef range Set; + + static const size_t PARALLEL_THRESHOLD = 10000; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64; + + /*! stores all information to perform some split */ + struct Split + { + /*! construct an invalid split by default */ + __forceinline Split() + : sah(inf), axis0(zero), axis1(zero) {} + + /*! constructs specified split */ + __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1) + : sah(sah), axis0(axis0), axis1(axis1) {} + + /*! calculates standard surface area heuristic for the split */ + __forceinline float splitSAH() const { return sah; } + + /*! 
test if this split is valid */ + __forceinline bool valid() const { return sah != float(inf); } + + public: + float sah; //!< SAH cost of the split + Vec3fa axis0, axis1; //!< axis the two strands are aligned into + }; + + __forceinline HeuristicStrandSplit () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + __forceinline const Vec3fa direction(const PrimRef& prim) { + return scene->get(prim.geomID())->computeDirection(prim.primID()); + } + + __forceinline const BBox3fa bounds(const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(prim.primID()); + } + + __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(space,prim.primID()); + } + + /*! finds the best split */ + const Split find(const range& set, size_t logBlockSize) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /* curve with minimum ID determines first axis */ + for (size_t i=set.begin(); i= bestGeomPrimID) continue; + const Vec3fa axis = direction(prims[i]); + if (sqr_length(axis) > 1E-18f) { + axis0 = normalize(axis); + bestGeomPrimID = geomprimID; + } + } + + /* find 2nd axis that is most misaligned with first axis and has minimum ID */ + float bestCos = 1.0f; + Vec3fa axis1 = axis0; + bestGeomPrimID = -1; + for (size_t i=set.begin(); i cos1) { lnum++; lbounds.extend(bounds(space0,prim)); } + else { rnum++; rbounds.extend(bounds(space1,prim)); } + } + + /*! return an invalid split if we do not partition */ + if (lnum == 0 || rnum == 0) + return Split(inf,axis0,axis1); + + /*! calculate sah for the split */ + const size_t lblocks = (lnum+(1ull<> logBlockSize; + const size_t rblocks = (rnum+(1ull<> logBlockSize; + const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds)); + return Split(sah,axis0,axis1); + } + + /*! 
array partitioning */ + void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + + auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,split.axis0)); + const float cos1 = abs(dot(axisi,split.axis1)); + return cos0 > cos1; + }; + + auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { + pinfo.extend(bounds(ref)); + }; + + size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const Set& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i + struct HeuristicMBlurTemporalSplit + { + typedef BinSplit Split; + typedef mvector* PrimRefVector; + typedef typename PrimRefMB::BBox BBox; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef) + : device(device), recalculatePrimRef(recalculatePrimRef) {} + + struct TemporalBinInfo + { + __forceinline 
TemporalBinInfo () { + } + + __forceinline TemporalBinInfo (EmptyTy) + { + for (size_t i=0; i= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* find linear bounds for both time segments */ + for (size_t i=begin; i& r) -> TemporalBinInfo { + TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; + }; + *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2); + } + } + + /*! merges in other binning information */ + __forceinline void merge (const TemporalBinInfo& other) + { + for (size_t i=0; i= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* calculate sah */ + const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size(); + float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size(); + if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time + if (unlikely(rCount == 0)) sah1 = 0.0f; + const float sah = sah0+sah1; + if (sah < bestSAH) { + bestSAH = sah; + bestPos = center_time; + } + } + return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos); + } + + public: + size_t count0[BINS-1]; + size_t count1[BINS-1]; + BBox bounds0[BINS-1]; + BBox bounds1[BINS-1]; + }; + + /*! 
finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + assert(set.size() > 0); + TemporalBinInfo binner(empty); + binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef); + Split tsplit = binner.best((int)logBlockSize,set.time_range,set); + if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split + return tsplit; + } + + __forceinline std::unique_ptr> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(tsplit.sah != float(inf)); + assert(tsplit.fpos > set.time_range.lower); + assert(tsplit.fpos < set.time_range.upper); + + float center_time = tsplit.fpos; + const BBox1f time_range0(set.time_range.lower,center_time); + const BBox1f time_range1(center_time,set.time_range.upper); + mvector& prims = *set.prims; + + /* calculate primrefs for first time range */ + std::unique_ptr> new_vector(new mvector(device, set.size())); + PrimRefVector lprims = new_vector.get(); + + auto reduction_func0 = [&] (const range& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); idata(), size_t(0), set.size(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); }); + + lset = SetMB(linfo,lprims,time_range0); + + /* calculate primrefs for second time range */ + auto reduction_func1 = [&] (const range& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i(set.begin(), set.begin() + rinfo.size()); + + /* primrefs for second time range are in prims[set.begin() .. 
set.end()) */ + /* some primitives may need to be filtered out */ + if (rinfo.size() != set.size()) + rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); }); + + rset = SetMB(rinfo,&prims,time_range1); + + return new_vector; + } + + private: + MemoryMonitorInterface* device; // device to report memory usage to + const RecalculatePrimRef recalculatePrimRef; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h new file mode 100644 index 00000000000..06c1388742a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h @@ -0,0 +1,362 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + // FIXME: maybe there's a better place for this util fct + __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2) + { + const Vec3fa e0 = v1-v0; + const Vec3fa e1 = v2-v0; + const Vec3fa d = cross(e0,e1); + return fabs(d.x) + fabs(d.y) + fabs(d.z); + } + + //namespace isa + //{ + template + class CentGeom + { + public: + __forceinline CentGeom () {} + + __forceinline CentGeom (EmptyTy) + : geomBounds(empty), centBounds(empty) {} + + __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) + : geomBounds(geomBounds), centBounds(centBounds) {} + + template + __forceinline void extend_primref(const PrimRef& prim) + { + BBox bounds; Vec3fa center; + prim.binBoundsAndCenter(bounds,center); + geomBounds.extend(bounds); + centBounds.extend(center); + } + + template + __forceinline void extend_center2(const PrimRef& prim) + { + BBox3fa bounds = prim.bounds(); + geomBounds.extend(bounds); + centBounds.extend(bounds.center2()); + } + + __forceinline void 
extend(const BBox& geomBounds_) { + geomBounds.extend(geomBounds_); + centBounds.extend(center2(geomBounds_)); + } + + __forceinline void merge(const CentGeom& other) + { + geomBounds.extend(other.geomBounds); + centBounds.extend(other.centBounds); + } + + static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { + CentGeom r = a; r.merge(b); return r; + } + + public: + BBox geomBounds; //!< geometry bounds of primitives + BBox3fa centBounds; //!< centroid bounds of primitives + }; + + typedef CentGeom CentGeomBBox3fa; + + /*! stores bounding information for a set of primitives */ + template + class PrimInfoT : public CentGeom + { + public: + using CentGeom::geomBounds; + using CentGeom::centBounds; + + __forceinline PrimInfoT () {} + + __forceinline PrimInfoT (EmptyTy) + : CentGeom(empty), begin(0), end(0) {} + + __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeom(centGeomBounds), begin(begin), end(end) {} + + template + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom::extend_primref(prim); + end++; + } + + template + __forceinline void add_center2(const PrimRef& prim) { + CentGeom::extend_center2(prim); + end++; + } + + template + __forceinline void add_center2(const PrimRef& prim, const size_t i) { + CentGeom::extend_center2(prim); + end+=i; + } + + /*__forceinline void add(const BBox& geomBounds_) { + CentGeom::extend(geomBounds_); + end++; + } + + __forceinline void add(const BBox& geomBounds_, const size_t i) { + CentGeom::extend(geomBounds_); + end+=i; + }*/ + + __forceinline void merge(const PrimInfoT& other) + { + CentGeom::merge(other); + begin += other.begin; + end += other.end; + } + + static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { + PrimInfoT r = a; r.merge(b); return r; + } + + /*! 
returns the number of primitives */ + __forceinline size_t size() const { + return end-begin; + } + + __forceinline float halfArea() { + return expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + //return halfArea(geomBounds)*blocks(num); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<> block_shift); + //return halfArea(geomBounds)*float((num+3) >> 2); + //return halfArea(geomBounds)*blocks(num); + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) { + return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}"; + } + + public: + size_t begin,end; //!< number of primitives + }; + + typedef PrimInfoT PrimInfo; + //typedef PrimInfoT PrimInfoMB; + + /*! stores bounding information for a set of primitives */ + template + class PrimInfoMBT : public CentGeom + { + public: + using CentGeom::geomBounds; + using CentGeom::centBounds; + + __forceinline PrimInfoMBT () { + } + + __forceinline PrimInfoMBT (EmptyTy) + : CentGeom(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + __forceinline PrimInfoMBT (size_t begin, size_t end) + : CentGeom(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + template + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom::extend_primref(prim); + time_range.extend(prim.time_range); + object_range._end++; + num_time_segments += prim.size(); + if (max_num_time_segments < prim.totalTimeSegments()) { + max_num_time_segments = prim.totalTimeSegments(); + max_time_range = prim.time_range; + } + } + + __forceinline void merge(const 
PrimInfoMBT& other) + { + CentGeom::merge(other); + time_range.extend(other.time_range); + object_range._begin += other.object_range.begin(); + object_range._end += other.object_range.end(); + num_time_segments += other.num_time_segments; + if (max_num_time_segments < other.max_num_time_segments) { + max_num_time_segments = other.max_num_time_segments; + max_time_range = other.max_time_range; + } + } + + static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { + PrimInfoMBT r = a; r.merge(b); return r; + } + + __forceinline size_t begin() const { + return object_range.begin(); + } + + __forceinline size_t end() const { + return object_range.end(); + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return object_range.size(); + } + + __forceinline float halfArea() const { + return time_range.size()*expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); + } + + __forceinline float leafSAH(size_t block_shift) const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<> block_shift); + } + + __forceinline float align_time(float ct) const + { + //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments); + float t0 = (ct-max_time_range.lower)/max_time_range.size(); + float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); + return t1*max_time_range.size()+max_time_range.lower; + } + + /*! 
stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) + { + return cout << "PrimInfo { " << + "object_range = " << pinfo.object_range << + ", time_range = " << pinfo.time_range << + ", time_segments = " << pinfo.num_time_segments << + ", geomBounds = " << pinfo.geomBounds << + ", centBounds = " << pinfo.centBounds << + "}"; + } + + public: + range object_range; //!< primitive range + size_t num_time_segments; //!< total number of time segments of all added primrefs + size_t max_num_time_segments; //!< maximum number of time segments of a primitive + BBox1f max_time_range; //!< time range of primitive with max_num_time_segments + BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB + }; + + typedef PrimInfoMBT PrimInfoMB; + + struct SetMB : public PrimInfoMB + { + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + typedef mvector* PrimRefVector; + + __forceinline SetMB() {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) + : PrimInfoMB(pinfo_i), prims(prims) {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range object_range_in, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + object_range = object_range_in; + time_range = intersect(time_range,time_range_in); + } + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + time_range = intersect(time_range,time_range_in); + } + + void deterministic_order() const + { + /* required as parallel partition destroys original primitive order */ + PrimRefMB* prim = prims->data(); + std::sort(&prim[object_range.begin()],&prim[object_range.end()]); + } + + template + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& 
recalculatePrimRef) const + { + auto reduce = [&](const range& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto reduce = [&](const range& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template + const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto computePrimInfo = [&](const range& r) -> PrimInfoMB + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) 
+ { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector& prims, BuildProgressMonitor& progressMonitor) + { + ParallelForForPrefixSumState pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + ParallelForForPrefixSumState pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID); + }, 
[](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) + { + ParallelForForPrefixSumState pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + } + + /* the BVH starts with that time range, even though primitives might have smaller/larger time range */ + pinfo.time_range = t0t1; + return pinfo; + } + + template + size_t createMortonCodeArray(Mesh* mesh, mvector& morton, BuildProgressMonitor& progressMonitor) + { + size_t numPrimitives = 
morton.size(); + + /* compute scene bounds */ + std::pair cb_empty(0,empty); + auto cb = parallel_reduce + ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range& r) -> std::pair + { + size_t num = 0; + BBox3fa bounds = empty; + + for (size_t j=r.begin(); jbuildBounds(j,&prim_bounds))) continue; + bounds.extend(center2(prim_bounds)); + num++; + } + return std::make_pair(num,bounds); + }, [] (const std::pair& a, const std::pair& b) { + return std::make_pair(a.first + b.first,merge(a.second,b.second)); + }); + + + size_t numPrimitivesGen = cb.first; + const BBox3fa centBounds = cb.second; + + /* compute morton codes */ + if (likely(numPrimitivesGen == numPrimitives)) + { + /* fast path if all primitives were valid */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range& r) -> void { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); jbounds(j),unsigned(j)); + }); + } + else + { + /* slow path, fallback in case some primitives were invalid */ + ParallelPrefixSumState pstate; + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); jbuildBounds(j,&bounds))) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus()); + + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]); + for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus()); + } + return numPrimitivesGen; + } + + // 
==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + // template for grid meshes + +#if 0 + template<> + PrimInfo createPrimRefArray(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor) + { + PING; + ParallelForForPrefixSumState pstate; + Scene::Iterator iter(scene); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,mesh->geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, const PrimInfo& base) -> PrimInfo + { + k = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jbuildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,mesh->geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } +#endif + + // ==================================================================================================== + // ==================================================================================================== + // 
==================================================================================================== + + IF_ENABLED_TRIS (template size_t createMortonCodeArray(TriangleMesh* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_QUADS(template size_t createMortonCodeArray(QuadMesh* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_USER (template size_t createMortonCodeArray(UserGeometry* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_INSTANCE (template size_t createMortonCodeArray(Instance* mesh COMMA mvector& morton COMMA BuildProgressMonitor& progressMonitor)); + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h new file mode 100644 index 00000000000..9919c945c31 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h @@ -0,0 +1,28 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" +#include "priminfo.h" +#include "bvh_builder_morton.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); + + template + size_t createMortonCodeArray(Mesh* mesh, mvector& morton, BuildProgressMonitor& progressMonitor); + } +} + diff --git 
a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h new file mode 100644 index 00000000000..8bdb38b955f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h @@ -0,0 +1,371 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../builders/primrefgen.h" +#include "../builders/heuristic_spatial.h" +#include "../builders/splitter.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define DBG_PRESPLIT(x) +#define CHECK_PRESPLIT(x) + +#define GRID_SIZE 1024 +#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 +#define MAX_PRESPLITS_PER_PRIMITIVE (1<(priority); + } + __forceinline bool operator < (const PresplitItem& item) const + { + return (priority < item.priority); + } + + template + __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) + { + const unsigned int geomID = ref.geomID(); + const unsigned int primID = ref.primID(); + const float area_aabb = area(ref.bounds()); + const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); + const unsigned int diff = 31 - lzcnt(mc.x^mc.y); + assert(area_prim <= area_aabb); + //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); + const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); + assert(priority >= 0.0f && priority < FLT_LARGE); + return priority; + } + + + }; + + inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { + return cout << "index " << item.index << " priority " << item.priority; + }; + + template + void splitPrimitive(SplitterFactory &Splitter, + const PrimRef &prim, + const unsigned int geomID, + const unsigned int primID, + const unsigned int split_level, + const 
Vec3fa &grid_base, + const float grid_scale, + const float grid_extend, + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], + unsigned int& numSubPrims) + { + assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + if (split_level == 0) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + } + else + { + const Vec3fa lower = prim.lower; + const Vec3fa upper = prim.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + + /* if all bits are equal then we cannot split */ + if(unlikely(lower_code == upper_code)) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + return; + } + + /* compute octree level and dimension to perform the split in */ + const unsigned int diff = 31 - lzcnt(lower_code^upper_code); + const unsigned int level = diff / 3; + const unsigned int dim = diff % 3; + + /* now we compute the grid position of the split */ + const unsigned int isplit = iupper[dim] & ~((1<= fsplit); + + /* split primitive */ + const auto splitter = Splitter(prim); + BBox3fa left,right; + splitter(prim.bounds(),dim,fsplit,left,right); + assert(!left.empty()); + assert(!right.empty()); + + + splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + } + } + + + 
template + PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) + { + const Vec3fa lower = ref.lower; + const Vec3fa upper = ref.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + + /* compute a morton code for the lower and upper grid coordinates. 
*/ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + return Vec2i(lower_code,upper_code); + } + + template + PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector& prims, BuildProgressMonitor& progressMonitor) + { + static const size_t MIN_STEP_SIZE = 128; + + ParallelForForPrefixSumState pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + + /* use correct number of primitives */ + size_t numPrimitives = pinfo.size(); + const size_t alloc_numPrimitives = prims.size(); + const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; + + /* set up primitive splitter */ + SplitterFactory Splitter(scene); + + + DBG_PRESPLIT( + const size_t org_numPrimitives = pinfo.size(); + PRINT(numPrimitives); + PRINT(alloc_numPrimitives); + PRINT(numSplitPrimitivesBudget); + ); + + /* allocate double buffer presplit items */ + const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; + PresplitItem *presplitItem = 
(PresplitItem*)alignedMalloc(presplit_allocation_size,64); + PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + + /* compute grid */ + const Vec3fa grid_base = pinfo.geomBounds.lower; + const Vec3fa grid_diag = pinfo.geomBounds.size(); + const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); + const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; + + /* init presplit items and get total sum */ + const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range& r) -> float { + float sum = 0.0f; + for (size_t i=r.begin(); i(prims[i],scene,mc) : 0.0f; + /* FIXME: sum undeterministic */ + sum += presplitItem[i].priority; + } + return sum; + },[](const float& a, const float& b) -> float { return a+b; }); + + /* compute number of splits per primitive */ + const float inv_psum = 1.0f / psum; + parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& r) -> void { + for (size_t i=r.begin(); i 0.0f) + { + const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; + if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims + { + presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); + //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + } + else + presplitItem[i].priority = 0.0f; + } + } + }); + + auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; + size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); + + /* anything to split ? 
*/ + if (center < numPrimitives) + { + const size_t numPrimitivesToSplit = numPrimitives - center; + assert(presplitItem[center].priority >= 1.0f); + + /* sort presplit items in ascending order */ + radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); + + CHECK_PRESPLIT( + parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& r) -> void { + for (size_t i=r.begin(); i& t) -> size_t { + size_t sum = 0; + for (size_t i=t.begin(); i= 1.0f); + const unsigned int primrefID = presplitItem[i].index; + const float prio = presplitItem[i].priority; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = (unsigned int)prio; + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + assert(numSubPrims); + numSubPrims--; // can reuse slot + sum+=numSubPrims; + presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; + primOffset0[i-center] = numSubPrims; + } + return sum; + },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); + + /* if we are over budget, need to shrink the range */ + if (totalNumSubPrims > numSplitPrimitivesBudget) + { + size_t new_center = numPrimitives-1; + size_t sum = 0; + for (;new_center>=center;new_center--) + { + const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; + if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; + sum += numSubPrims; + } + new_center++; + center = new_center; + } + + /* parallel prefix sum to compute offsets for storing sub-primitives */ + const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus()); + + /* iterate over range, and split primitives into sub primitives and append them to prims array */ + 
parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range& rn) -> void { + for (size_t j=rn.begin(); j& r) -> PrimInfo { + PrimInfo p(empty); + for (size_t j=r.begin(); j PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + + /* free double buffer presplit items */ + alignedFree(tmp_presplitItem); + alignedFree(presplitItem); + return pinfo; + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h new file mode 100644 index 00000000000..dbd6cf07c7b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h @@ -0,0 +1,169 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" + +namespace embree +{ + namespace isa + { + template + __forceinline void splitPolygon(const BBox3fa& bounds, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + const Vec3fa (&inv_length)[N], + BBox3fa& left_o, + BBox3fa& right_o) + { + BBox3fa left = empty, right = empty; + /* clip triangle to left and right box by processing all edges */ + for (size_t i=0; i= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + left_o = intersect(left,bounds); + right_o = intersect(right,bounds); + } + + template + __forceinline void splitPolygon(const PrimRef& prim, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + PrimRef& left_o, + PrimRef& right_o) + { + BBox3fa left = empty, right = empty; + for (size_t i=0; i= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < 
v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const float inv_length = 1.0f/(v1d-v0d); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID()); + new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID()); + } + + struct TriangleSplitter + { + __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask ); + TriangleMesh::Triangle tri = mesh->triangle(prim.primID()); + v[0] = mesh->vertex(tri.v[0]); + v[1] = mesh->vertex(tri.v[1]); + v[2] = mesh->vertex(tri.v[2]); + v[3] = mesh->vertex(tri.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<3>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[4]; + Vec3fa inv_length[3]; + }; + + struct TriangleSplitterFactory + { + __forceinline TriangleSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline TriangleSplitter operator() (const PrimRef& prim) const { + return TriangleSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + struct QuadSplitter + { + __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const QuadMesh* mesh = 
(const QuadMesh*) scene->get(prim.geomID() & mask ); + QuadMesh::Quad quad = mesh->quad(prim.primID()); + v[0] = mesh->vertex(quad.v[0]); + v[1] = mesh->vertex(quad.v[1]); + v[2] = mesh->vertex(quad.v[2]); + v[3] = mesh->vertex(quad.v[3]); + v[4] = mesh->vertex(quad.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[5]; + Vec3fa inv_length[4]; + }; + + struct QuadSplitterFactory + { + __forceinline QuadSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline QuadSplitter operator() (const PrimRef& prim) const { + return QuadSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp new file mode 100644 index 00000000000..bd102bd6ef5 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp @@ -0,0 +1,190 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" + +namespace embree +{ + template + BVHN::BVHN (const PrimitiveType& primTy, Scene* scene) + : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? 
AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), + primTy(&primTy), device(scene->device), scene(scene), + root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) + { + } + + template + BVHN::~BVHN () + { + for (size_t i=0; i + void BVHN::clear() + { + set(BVHN::emptyNode,empty,0); + alloc.clear(); + } + + template + void BVHN::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) + { + this->root = root; + this->bounds = bounds; + this->numPrimitives = numPrimitives; + } + + template + void BVHN::clearBarrier(NodeRef& node) + { + if (node.isBarrier()) + node.clearBarrier(); + else if (!node.isLeaf()) { + BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH + for (size_t c=0; cchild(c)); + } + } + + template + void BVHN::layoutLargeNodes(size_t num) + { +#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues + struct NodeArea + { + __forceinline NodeArea() {} + + __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds) + : node(&node), A(node.isLeaf() ? 
float(neg_inf) : area(bounds)) {} + + __forceinline bool operator< (const NodeArea& other) const { + return this->A < other.A; + } + + NodeRef* node; + float A; + }; + std::vector lst; + lst.reserve(num); + lst.push_back(NodeArea(root,empty)); + + while (lst.size() < num) + { + std::pop_heap(lst.begin(), lst.end()); + NodeArea n = lst.back(); lst.pop_back(); + if (!n.node->isAABBNode()) break; + AABBNode* node = n.node->getAABBNode(); + for (size_t i=0; ichild(i) == BVHN::emptyNode) continue; + lst.push_back(NodeArea(node->child(i),node->bounds(i))); + std::push_heap(lst.begin(), lst.end()); + } + } + + for (size_t i=0; isetBarrier(); + + root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator()); +#endif + } + + template + typename BVHN::NodeRef BVHN::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator) + { + if (node.isBarrier()) { + node.clearBarrier(); + return node; + } + else if (node.isAABBNode()) + { + AABBNode* oldnode = node.getAABBNode(); + AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment); + *newnode = *oldnode; + for (size_t c=0; cchild(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator); + return encodeNode(newnode); + } + else return node; + } + + template + double BVHN::preBuild(const std::string& builderName) + { + if (builderName == "") + return inf; + + if (device->verbosity(2)) + { + Lock lock(g_printMutex); + std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." 
<< std::endl << std::flush; + } + + double t0 = 0.0; + if (device->benchmark || device->verbosity(2)) t0 = getSeconds(); + return t0; + } + + template + void BVHN::postBuild(double t0) + { + if (t0 == double(inf)) + return; + + double dt = 0.0; + if (device->benchmark || device->verbosity(2)) + dt = getSeconds()-t0; + + std::unique_ptr> stat; + + /* print statistics */ + if (device->verbosity(2)) + { + if (!stat) stat.reset(new BVHNStatistics(this)); + const size_t usedBytes = alloc.getUsedBytes(); + Lock lock(g_printMutex); + std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl; + + if (device->verbosity(2)) + std::cout << stat->str(); + + if (device->verbosity(2)) + { + FastAllocator::AllStatistics stat(&alloc); + for (size_t i=0; ialloc); + + stat.print(numPrimitives); + } + + if (device->verbosity(3)) + { + alloc.print_blocks(); + for (size_t i=0; ialloc.print_blocks(); + } + + std::cout << std::flush; + } + + /* benchmark mode */ + if (device->benchmark) + { + if (!stat) stat.reset(new BVHNStatistics(this)); + Lock lock(g_printMutex); + std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush; + } + } + +#if defined(__AVX__) + template class BVHN<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) + template class BVHN<4>; +#endif +} + diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h new file mode 100644 index 00000000000..8fdf912e520 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.h @@ -0,0 +1,235 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* include all node types */ +#include 
"bvh_node_aabb.h" +#include "bvh_node_aabb_mb.h" +#include "bvh_node_aabb_mb4d.h" +#include "bvh_node_obb.h" +#include "bvh_node_obb_mb.h" +#include "bvh_node_qaabb.h" + +namespace embree +{ + /*! flags used to enable specific node types in intersectors */ + enum BVHNodeFlags + { + BVH_FLAG_ALIGNED_NODE = 0x00001, + BVH_FLAG_ALIGNED_NODE_MB = 0x00010, + BVH_FLAG_UNALIGNED_NODE = 0x00100, + BVH_FLAG_UNALIGNED_NODE_MB = 0x01000, + BVH_FLAG_QUANTIZED_NODE = 0x100000, + BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, + + /* short versions */ + BVH_AN1 = BVH_FLAG_ALIGNED_NODE, + BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB, + BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_UN1 = BVH_FLAG_UNALIGNED_NODE, + BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB, + BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE, + BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_QN1 = BVH_FLAG_QUANTIZED_NODE + }; + + /*! Multi BVH with N children. Each node stores the bounding box of + * it's N children as well as N child references. */ + template + class BVHN : public AccelData + { + ALIGNED_CLASS_(16); + public: + + /*! forward declaration of node ref type */ + typedef NodeRefPtr NodeRef; + typedef BaseNode_t BaseNode; + typedef AABBNode_t AABBNode; + typedef AABBNodeMB_t AABBNodeMB; + typedef AABBNodeMB4D_t AABBNodeMB4D; + typedef OBBNode_t OBBNode; + typedef OBBNodeMB_t OBBNodeMB; + typedef QuantizedBaseNode_t QuantizedBaseNode; + typedef QuantizedBaseNodeMB_t QuantizedBaseNodeMB; + typedef QuantizedNode_t QuantizedNode; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! 
Empty node */ + static const size_t emptyNode = NodeRef::emptyNode; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = NodeRef::invalidNode; + static const size_t popRay = NodeRef::popRay; + + /*! Maximum depth of the BVH. */ + static const size_t maxBuildDepth = 32; + static const size_t maxBuildDepthLeaf = maxBuildDepth+8; + static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks; + + public: + + /*! Builder interface to create allocator */ + struct CreateAlloc : public FastAllocator::Create { + __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} + }; + + typedef BVHNodeRecord NodeRecord; + typedef BVHNodeRecordMB NodeRecordMB; + typedef BVHNodeRecordMB4D NodeRecordMB4D; + + public: + + /*! BVHN default constructor. */ + BVHN (const PrimitiveType& primTy, Scene* scene); + + /*! BVHN destruction */ + ~BVHN (); + + /*! clears the acceleration structure */ + void clear(); + + /*! sets BVH members after build */ + void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives); + + /*! Clears the barrier bits of a subtree. */ + void clearBarrier(NodeRef& node); + + /*! lays out num large nodes of the BVH */ + void layoutLargeNodes(size_t num); + NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator); + + /*! called by all builders before build starts */ + double preBuild(const std::string& builderName); + + /*! called by all builders after build ended */ + void postBuild(double t0); + + /*! allocator class */ + struct Allocator { + BVHN* bvh; + Allocator (BVHN* bvh) : bvh(bvh) {} + __forceinline void* operator() (size_t bytes) const { + return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); + } + }; + + /*! post build cleanup */ + void cleanup() { + alloc.cleanup(); + } + + public: + + /*! 
Encodes a node */ + static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); } + static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); } + + public: + + /*! Prefetches the node this reference points to */ + __forceinline static void prefetch(const NodeRef ref, int types=0) + { +#if defined(__AVX512PF__) // MIC + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL2(((char*)ref.ptr)+2*64); + prefetchL2(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* KNL still needs L2 prefetches for large nodes */ + prefetchL2(((char*)ref.ptr)+4*64); + prefetchL2(((char*)ref.ptr)+5*64); + prefetchL2(((char*)ref.ptr)+6*64); + prefetchL2(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + prefetchL2(((char*)ref.ptr)+2*64); + } +#else + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL1(((char*)ref.ptr)+2*64); + prefetchL1(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* deactivate for large nodes on Xeon, as it introduces regressions */ + 
//prefetchL1(((char*)ref.ptr)+4*64); + //prefetchL1(((char*)ref.ptr)+5*64); + //prefetchL1(((char*)ref.ptr)+6*64); + //prefetchL1(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + prefetchL1(((char*)ref.ptr)+2*64); + } +#endif + } + + __forceinline static void prefetchW(const NodeRef ref, int types=0) + { + embree::prefetchEX(((char*)ref.ptr)+0*64); + embree::prefetchEX(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+2*64); + embree::prefetchEX(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+4*64); + embree::prefetchEX(((char*)ref.ptr)+5*64); + embree::prefetchEX(((char*)ref.ptr)+6*64); + embree::prefetchEX(((char*)ref.ptr)+7*64); + } + } + + /*! bvh type information */ + public: + const PrimitiveType* primTy; //!< primitive type stored in the BVH + + /*! bvh data */ + public: + Device* device; //!< device pointer + Scene* scene; //!< scene pointer + NodeRef root; //!< root node + FastAllocator alloc; //!< allocator used to allocate nodes + + /*! statistics data */ + public: + size_t numPrimitives; //!< number of primitives the BVH is build over + size_t numVertices; //!< number of vertices the BVH references + + /*! 
data arrays for special builders */ + public: + std::vector objects; + vector_t> subdiv_patches; + }; + + typedef BVHN<4> BVH4; + typedef BVHN<8> BVH8; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp new file mode 100644 index 00000000000..23f4f63d45f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp @@ -0,0 +1,1325 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh4_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + 
DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + 
DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + 
DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + 
DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + 
DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + 
DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + 
DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + + BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void 
BVH4Factory::selectBuilders(int features) + { + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH)); + + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH)); + 
IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH)); + } + + void BVH4Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB)); + + /* select 
intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker)); + 
IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4)); + + 
IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker)); + + 
IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller)); + 
IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter)); + 
IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter)); + 
IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } 
+ case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return 
intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::ROBUST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + 
intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = 
BVH4Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter(); + 
intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = 
BVH4Quad4iIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh) 
+ { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH4VirtualIntersectorStream(); +#endif + intersectors.collider = BVH4ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH4InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk(); + 
intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1Intersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1Intersector4(); + intersectors.intersector8 = BVH4SubdivPatch1Intersector8(); + intersectors.intersector16 = BVH4SubdivPatch1Intersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4(); + intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8(); + intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new 
BVH4(Curve8i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4v::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, 
IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = 
BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4v::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = 
BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default" ) { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder 
"+scene->device->tri_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4vMB::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = 
BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* 
BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + 
Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel); + Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel); + Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene) + { + BVH4* accel = new 
BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel); + Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel); + auto gtype = isExpensive ? 
Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH4GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH4GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + 
Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB"); + return new AccelInstance(accel,builder,intersectors); + } + +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h new file mode 100644 index 00000000000..a68227b41f8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h @@ -0,0 +1,316 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! 
BVH4 instantiations */ + class BVH4Factory : public BVHFactory + { + public: + BVH4Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST); + Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant 
= IntersectVariant::FAST); + + Accel* BVH4QuantizedTriangle4i(Scene* scene); + Accel* BVH4QuantizedQuad4i(Scene* scene); + + Accel* BVH4SubdivPatch1(Scene* scene); + Accel* BVH4SubdivPatch1MB(Scene* scene); + + Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4UserGeometryMB(Scene* scene); + + Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh); + Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh); + + Accel::Intersectors 
BVH4UserGeometryIntersectors(BVH4* bvh); + Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh); + Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh); + Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + private: + + DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); 
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + 
DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + 
DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + 
DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller); + 
DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + 
DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // spatial scene builder + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + 
DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp new file mode 100644 index 00000000000..9fe057c3923 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp @@ -0,0 +1,1165 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8 + +#if defined (EMBREE_TARGET_SIMD8) + +#include "bvh8_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + 
DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + 
// Forward declarations of ISA-specialized intersector symbols for the BVH8
// (8-wide BVH) acceleration structure. Each DECLARE_SYMBOL2 names a symbol
// whose per-ISA definitions live in separately compiled translation units;
// the concrete variant is bound at runtime by BVH8Factory::selectIntersectors().
//
// NOTE(review): the naming conventions below (Hybrid = hybrid packet/single-ray
// traversal, Moeller/Pluecker = triangle test variant, MB = motion blur,
// NoFilter = skips filter callbacks) are inferred from the symbol names; the
// definitions are in other translation units — confirm there.

/* ray-packet-of-4 intersectors (continued) */
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);

DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);

/* ray-packet-of-8 intersectors */
DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);

DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);

/* ray-packet-of-16 intersectors */
DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);

DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);

/* stream (N-ray) intersectors */
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);

DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);

DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);

DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);

DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);

// ISA-specialized BVH8 builder entry points; bound at runtime by
// BVH8Factory::selectBuilders(). COMMA separates argument types inside the
// macro's parameter list.
DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);

DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);

DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);

DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);

DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);

DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);

DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);

// Constructs the factory by binding every ISA-specialized builder and
// intersector symbol.
//   bfeatures: CPU feature mask used to select builder variants.
//   ifeatures: CPU feature mask used to select intersector variants (also
//              used for the user-geometry collider below).
BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
{
  SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom);

  selectBuilders(bfeatures);
  selectIntersectors(ifeatures);
}

// Binds the BVH8 builder function pointers for the given CPU feature mask.
// Each IF_ENABLED_* wrapper appears to compile the binding only when support
// for that geometry type is enabled at build time, and the
// SELECT_SYMBOL_INIT_* suffix lists the ISA variants available for that
// builder (AVX only vs. AVX + AVX-512 KNL) — semantics live in the macro
// definitions, which are not visible here.
void BVH8Factory::selectBuilders(int features)
{
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH));
  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));

  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH));

  IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH));
  IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH));
  IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH));
  IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH));
  IF_ENABLED_USER (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH));
  IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH));
}

// Binds the BVH8 intersector function pointers for the given CPU feature
// mask, grouped by ray-packet width. The packet intersectors (4/8/16/stream)
// are only compiled when EMBREE_RAY_PACKETS is defined; single-ray
// (intersector1) bindings are unconditional.
void BVH8Factory::selectIntersectors(int features)
{
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));

  /* select intersectors1 */
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1));
  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1));
  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1));

  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller));
  // NOTE(review): the next statement has no trailing ';' in the upstream
  // source; it compiles because the macro expansion supplies its own
  // statement terminators. Kept byte-identical to upstream.
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller))
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker));

#if defined (EMBREE_RAY_PACKETS)

  /* select intersectors4 */
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker));

  // NOTE(review): the motion-blur quad packet intersectors only list
  // AVX/AVX2 variants (no AVX-512) — matches upstream.
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk));
  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk));
  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk));

  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller));
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker));

  /* select intersectors8 */
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk));
  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk));
  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk));

  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller));
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker));

  /* select intersectors16 */
  // 16-wide packets exist only on AVX-512 targets (KNL/SKX variants only).
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid));
  IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker));

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk));
  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk));
  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk));

  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller));
  IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker));

  /* select stream intersectors */

  // Unconditional: the packet fallback must always be available as the
  // intersectorN entry for bundles that have no native stream variant.
  SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback);

  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker));
  IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker));

  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker));
  IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker));

  IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream));

  IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream));

#endif
}

// Assembles the intersector function table for a BVH8 over oriented-bounding-box
// curve nodes. FAST selects the plain hybrid variants, ROBUST the Robust
// variants (per symbol names). Packet widths 4/8/16 and the stream fallback
// are only populated when EMBREE_RAY_PACKETS is defined. Any other variant
// asserts in debug builds and returns an empty table in release builds.
Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.leafIntersector = leafIntersector;
    intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid();
    intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid();
    intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.leafIntersector = leafIntersector;
    intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
    intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
    intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  default: assert(false);
  }
  return Accel::Intersectors();  // unreachable in debug; empty table in release
}

// Motion-blur counterpart of BVH8OBBVirtualCurveIntersectors: identical
// structure, but binds the *MB intersector variants.
Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.leafIntersector = leafIntersector;
    intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB();
    intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB();
    intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.leafIntersector = leafIntersector;
    intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
    intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
    intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  default: assert(false);
  }
  return Accel::Intersectors();
}

// Intersector table for BVH8 over Triangle4 leaves. Only the FAST variant is
// supported (asserted below). This bundle fills the *_filter / *_nofilter
// packet slots separately so the caller can skip filter callbacks.
Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant)
{
  assert(ivariant == IntersectVariant::FAST);
  Accel::Intersectors intersectors;
  intersectors.ptr = bvh;
  intersectors.intersector1 = BVH8Triangle4Intersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
  intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller();
  intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter();
  intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller();
  intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter();
  intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller();
  intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
  intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller();
  intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter();
#endif
  return intersectors;
}

// Intersector table for BVH8 over Triangle4v leaves. The single-ray
// intersector can be switched to the experimental Woop test via the local
// ENABLE_WOOP_TEST toggle (off by default); packet variants always use
// Pluecker. `ivariant` is effectively ignored (the ROBUST assert is
// commented out upstream).
Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  Accel::Intersectors intersectors;
  intersectors.ptr = bvh;
#define ENABLE_WOOP_TEST 0
#if ENABLE_WOOP_TEST == 0
  //assert(ivariant == IntersectVariant::ROBUST);
  intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker();
#else
  intersectors.intersector1 = BVH8Triangle4vIntersector1Woop();
#endif

#if defined (EMBREE_RAY_PACKETS)
  intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker();
  intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker();
  intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker();
  intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker();
#endif
  return intersectors;
}

// Intersector table for BVH8 over Triangle4i leaves.
// FAST -> Moeller variants, ROBUST -> Pluecker variants. No default case:
// any other variant falls through to the empty-table return.
Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller();
    intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller();
    intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
    intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker();
    intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker();
    intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
    intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker();
#endif
    return intersectors;
  }
  }
  return Accel::Intersectors();
}

// Intersector table for BVH8 over motion-blurred Triangle4v leaves.
// Motion-blur bundles have no native stream intersector, so intersectorN is
// the packet fallback.
Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller();
    intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller();
    intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker();
    intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker();
    intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  }
  return Accel::Intersectors();
}

// Intersector table for BVH8 over motion-blurred Triangle4i leaves
// (same structure as the Triangle4vMB bundle above).
Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller();
    intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller();
    intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker();
    intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker();
    intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
    intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
#endif
    return intersectors;
  }
  }
  return Accel::Intersectors();
}

// Intersector table for BVH8 over Quad4v leaves. Note the asymmetry (as
// upstream): FAST fills the separate *_filter / *_nofilter slots, while
// ROBUST fills only the plain packet slots.
Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Quad4vIntersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller();
    intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter();
    intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller();
    intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter();
    intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller();
    intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
    intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller();
    intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker();
    intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker();
    intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
    intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker();
#endif
    return intersectors;
  }
  }
  return Accel::Intersectors();
}

// Intersector table for BVH8 over Quad4i leaves.
// FAST -> Moeller variants, ROBUST -> Pluecker variants.
Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
{
  switch (ivariant) {
  case IntersectVariant::FAST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Quad4iIntersector1Moeller();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller();
    intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller();
    intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
    intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller();
#endif
    return intersectors;
  }
  case IntersectVariant::ROBUST:
  {
    Accel::Intersectors intersectors;
    intersectors.ptr = bvh;
    intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker();
#if defined (EMBREE_RAY_PACKETS)
    intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker();
    intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker();
    intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
    intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker();
#endif
    return intersectors;
  }
  }
  return Accel::Intersectors();
}
Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh) + { + 
Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH8VirtualIntersectorStream(); +#endif + intersectors.collider = BVH8ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH8InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk(); + 
intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8v::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant); + Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant); + Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if 
(scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4v::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8"); + + return new 
AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { // FIXME: implement + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4vMB::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = 
QBVH8Triangle4iIntersectors(accel); + Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel); + Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case 
BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel); + Builder* builder = nullptr; + if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = 
BVH8VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel); + Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel); + auto gtype = isExpensive ? 
Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel); + auto gtype = isExpensive ? 
Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH8GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH8GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = nullptr; + intersectors.intersector8 = nullptr; + intersectors.intersector16 = nullptr; + intersectors.intersectorN = nullptr; +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder == 
"default") { + builder = BVH8GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder_mb == "default") { + builder = BVH8GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB"); + return new AccelInstance(accel,builder,intersectors); + } +} + +#endif diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h new file mode 100644 index 00000000000..b92188e7d35 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h @@ -0,0 +1,280 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! 
BVH8 instantiations */ + class BVH8Factory : public BVHFactory + { + public: + BVH8Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant); + Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8QuantizedTriangle4i(Scene* scene); + Accel* BVH8QuantizedTriangle4(Scene* scene); + Accel* BVH8QuantizedQuad4i(Scene* scene); + + Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8UserGeometryMB(Scene* scene); + + Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH8Grid(Scene* scene, 
BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh); + Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh); + Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh); + Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh); + Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + private: + DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + 
DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + 
DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + 
DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + 
DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + 
DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + 
DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // SAH spatial scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + 
DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp new file mode 100644 index 00000000000..e832537ec57 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp @@ -0,0 +1,60 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder.h" + +namespace embree +{ + namespace isa + { + template + typename BVHN::NodeRef BVHNBuilderVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build + (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + + template + typename BVHN::NodeRef BVHNBuilderQuantizedVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build + 
(FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template + typename BVHN::NodeRecordMB BVHNBuilderMblurVirtual::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range& set, const Allocator& alloc) -> NodeRecordMB { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build + (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template struct BVHNBuilderVirtual<4>; + template struct BVHNBuilderQuantizedVirtual<4>; + template struct BVHNBuilderMblurVirtual<4>; + +#if defined(__AVX__) + template struct BVHNBuilderVirtual<8>; + template struct BVHNBuilderQuantizedVirtual<8>; + template struct BVHNBuilderMblurVirtual<8>; +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h new file mode 100644 index 00000000000..1b86bb45ada --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h @@ -0,0 +1,114 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "../builders/bvh_builder_sah.h" + +namespace embree +{ + namespace isa + { + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template + 
struct BVHNBuilderVirtual + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; + }; + + template + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template + struct BVHNBuilderQuantizedVirtual + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; + }; + + template + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, 
GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template + struct BVHNBuilderMblurVirtual + { + typedef BVHN BVH; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange); + virtual NodeRecordMB createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) = 0; + }; + + template + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRecordMB createLeaf (const PrimRef* prims, const range& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template + static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) { + return BVHNBuilderT(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp new file mode 100644 index 00000000000..64759c1294a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp @@ -0,0 +1,531 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" +#include "bvh_rotate.h" +#include "../common/profile.h" +#include "../../common/algorithms/parallel_prefix_sum.h" + +#include "../builders/primrefgen.h" +#include 
"../builders/bvh_builder_morton.h" + +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +#if defined(__X86_64__) || defined(__aarch64__) +# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform +#else +# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues +#endif + +namespace embree +{ + namespace isa + { + template + struct SetBVHNBounds + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + typedef typename BVH::AABBNode AABBNode; + + BVH* bvh; + __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) + { + AABBNode* node = ref.getAABBNode(); + + BBox3fa res = empty; + for (size_t i=0; isetRef(i,children[i].ref); + node->setBounds(i,b); + } + + BBox3fx result = (BBox3fx&)res; +#if ROTATE_TREE + if (N == 4) + { + size_t n = 0; + for (size_t i=0; i= 4096) { + for (size_t i=0; i::rotate(node->child(i)); + node->child(i).setBarrier(); + } + } + } + result.lower.a = unsigned(n); + } +#endif + + return NodeRecord(ref,result); + } + }; + + template + struct CreateMortonLeaf; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ 
+ Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; itriangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + + Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = unsigned(current.size()); +#endif + return NodeRecord(ref,box_o); + } + + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ mesh = 
this->mesh; + + for (size_t i=0; itriangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 v0 = zero, v1 = zero, v2 = zero; + vuint4 vgeomID = -1, vprimID = -1; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; itriangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = 
max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID[i] = geomID_; + vprimID[i] = primID; + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; + } + + for (size_t i=items; i<4; i++) + { + vgeomID[i] = vgeomID[0]; + vprimID[i] = -1; + v0[i] = 0; + v1[i] = 0; + v2[i] = 0; + } + Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; + const QuadMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; iquad(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + const Vec3fa& p3 = mesh->vertex(tri.v[3]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + vgeomID [i] = geomID_; + vprimID 
[i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + QuadMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + + /* allocate leaf node */ + Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const UserGeometry* mesh = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; ibounds(primID)); + new (&accel[i]) Object(geomID_,primID); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + UserGeometry* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CreateMortonLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), 
morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items <= 1); + + /* allocate leaf node */ + InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const Instance* instance = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; ibounds(primID)); + new (&accel[i]) InstancePrimitive(instance, geomID_); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + Instance* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits::max(); + }; + + template + struct CalculateMeshBounds + { + __forceinline CalculateMeshBounds (Mesh* mesh) + : mesh(mesh) {} + + __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { + return mesh->bounds(morton.index); + } + + private: + Mesh* mesh; + }; + + template + class BVHNMeshBuilderMorton : public Builder + { + typedef BVHN BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + public: + + BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) + : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} + + /* build function */ + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + 
morton.clear(); + } + size_t numPrimitives = mesh->size(); + numPreviousPrimitives = numPrimitives; + + /* skip build for empty scene */ + if (numPrimitives == 0) { + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* preallocate arrays */ + morton.resize(numPrimitives); + size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim); + bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes + bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated); + + /* create morton code array */ + BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); + size_t numPrimitivesGen = createMortonCodeArray(mesh,morton,bvh->scene->progressInterface); + + /* create BVH */ + SetBVHNBounds setBounds(bvh); + CreateMortonLeaf createLeaf(mesh,geomID_,morton.data()); + CalculateMeshBounds calculateBounds(mesh); + auto root = BVHBuilderMorton::build( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create(), + setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface, + morton.data(),dest,numPrimitivesGen,settings); + + bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); + +#if ROTATE_TREE + if (N == 4) + { + for (int i=0; i::rotate(bvh->root); + bvh->clearBarrier(bvh->root); + } +#endif + + /* clear temporary data for static geometry */ + if (bvh->scene->isStaticAccel()) { + morton.clear(); + } + bvh->cleanup(); + } + + void clear() { + morton.clear(); + } + + private: + BVH* bvh; + Mesh* mesh; + mvector morton; + BVHBuilderMorton::Settings settings; + unsigned int geomID_ = std::numeric_limits::max(); + unsigned int numPreviousPrimitives = 0; + }; + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t 
mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* 
BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp new file mode 100644 index 00000000000..cf5b2eb47f8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp @@ -0,0 +1,640 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define PROFILE 0 +#define PROFILE_RUNS 20 + +namespace embree +{ + namespace isa + { + template + struct CreateLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef 
NodeRef; + + __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; iscene); + } + return node; + } + + BVH* bvh; + }; + + + template + struct CreateLeafQuantized + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; iscene); + } + return node; + } + + BVH* bvh; + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template + struct BVHNBuilderSAH : public Builder + { + typedef BVHN BVH; + typedef typename BVHN::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits::max (); + bool primrefarrayalloc; + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t 
minLeafSize, const size_t maxLeafSize, + const Geometry::GTypeMask gtype, bool primrefarrayalloc = false) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), + settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {} + + BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? 
"" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + /* create primref array */ + if (primrefarrayalloc) { + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + } + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + prims.resize(numPrimitives); + + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeaf(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + +#if PROFILE + }); +#endif + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + 
/************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template + struct BVHNBuilderSAHQuantized : public Builder + { + typedef BVHN BVH; + typedef typename BVHN::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {} + + BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? 
mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + prims.clear(); + bvh->clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + /* create primref array */ + prims.resize(numPrimitives); + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* call BVH builder */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + NodeRef root = BVHNBuilderQuantizedVirtual::build(&bvh->alloc,CreateLeafQuantized(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!! 
+#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + + template + struct CreateLeafGrid + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); //Primitive::blocks(n); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i* accel = (SubGridQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs); + + for (size_t g=0;g(x,y,primID,bounds,geomIDs[g],pos); + } + + return node; + } + + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + + + template + struct BVHNBuilderSAHGrid : public Builder + { + typedef BVHN BVH; + typedef typename BVHN::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + GridMesh* mesh; + mvector prims; + mvector sgrids; + GeneralBVHBuilder::Settings settings; + unsigned int geomID_ = std::numeric_limits::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, 
const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} + + BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + const size_t numGridPrimitives = mesh ? 
mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false); + numPreviousPrimitives = numGridPrimitives; + + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + if (!mesh) + { + /* first run to get #primitives */ + + ParallelForForPrefixSumState pstate; + Scene::Iterator iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jvalid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j); + if (!mesh->valid(j)) continue; + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jvalid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; ybuildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + assert(pinfo.size() == numPrimitives); + } + else + { + ParallelPrefixSumState pstate; + /* iterate over all grids in a single mesh */ + pinfo = parallel_prefix_sum( pstate, size_t(0), 
mesh->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jvalid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,geomID_,unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range& r, const PrimInfo& base) -> PrimInfo + { + + size_t p_index = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jvalid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; ybuildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,geomID_,unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + } + + /* no primitives */ + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + sgrids.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? 
"" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + + /* create primref array */ + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + sgrids.clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeafGrid>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary array */ + sgrids.clear(); + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* 
BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + + + Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new 
BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new 
BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + + Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type); + } +#if defined(__AVX__) + + Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = 
scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype); + } +#if defined(__AVX__) + Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); } + Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct + +#if defined(__AVX__) + Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); } + Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new 
BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct +#endif +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp new file mode 100644 index 00000000000..9c01553ec69 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp @@ -0,0 +1,705 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/bvh_builder_msmblur.h" + +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + + +namespace embree +{ + namespace isa + { + +#if 0 + template + struct CreateMBlurLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(set.size()); + size_t start = set.begin(); + for (size_t i=start; iencodeLeaf((char*)accel,items); + + LBBox3fa allBounds = empty; + for (size_t i=0; iscene, time)); + + return NodeRecordMB(node,allBounds); + } + + BVH* bvh; + PrimRef* prims; + size_t time; + }; +#endif 
+ + template + struct CreateMSMBlurLeaf + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(current.prims.size()); + size_t start = current.prims.begin(); + size_t end = current.prims.end(); + for (size_t i=start; iencodeLeaf((char*)accel,items); + LBBox3fa allBounds = empty; + for (size_t i=0; idata(), start, current.prims.end(), bvh->scene, current.prims.time_range)); + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + BVH* bvh; + }; + + /* Motion blur BVH with 4D nodes and internal time splits */ + template + struct BVHNBuilderMBlurSAH : public Builder + { + typedef BVHN BVH; + typedef typename BVHN::NodeRef NodeRef; + typedef typename BVHN::NodeRecordMB NodeRecordMB; + typedef typename BVHN::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + const Geometry::GTypeMask gtype_; + + BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype_,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + //const size_t numTimeSteps 
= scene->getNumTimeSteps(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + + /*if (numTimeSegments == 1) + buildSingleSegment(numPrimitives); + else*/ + buildMultiSegment(numPrimitives); + +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf. + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build + (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(), + CreateMBlurLeaf(bvh,prims.data(),0),bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + 
mvector prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = Primitive::singleTimeSegment; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build(prims,pinfo,scene->device, + RecalculatePrimRef(scene), + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeaf(bvh), + bvh->scene->progressInterface, + settings); + + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct GridRecalculatePrimRef + { + Scene* scene; + const SubGridBuildData * const sgrids; + + __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * 
const sgrids) + : scene(scene), sgrids(sgrids) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range); + const unsigned num_time_segments = mesh->numTimeSegments(); + const range tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + return mesh->linearBounds(mesh->grid(primID),x,y,time_range); + } + + }; + + template + struct CreateMSMBlurLeafGrid + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = current.prims.size(); + const size_t start = current.prims.begin(); + + const PrimRefMB* prims = current.prims.prims->data(); + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = 
prims[start].geomID(); + + for (size_t i=1;i* accel = (SubGridMBQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;gget(geomIDs[g]); + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;ilinearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range); + allBounds.extend(newBounds); + bounds0[pos] = newBounds.bounds0; + bounds1[pos] = newBounds.bounds1; + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridMBQBVHN(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos); + } + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + +#if 0 + template + struct CreateLeafGridMB + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) + : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i* accel = (SubGridMBQBVHN*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;gget(geomIDs[g]); + + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa 
bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;ibuildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]); + bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]); + assert(valid0); + assert(valid1); + allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos])); + pos++; + } + new (&accel[g]) SubGridMBQBVHN(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos); + } + return NodeRecordMB(node,allBounds); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; +#endif + + + /* Motion blur BVH with 4D nodes and internal time splits */ + template + struct BVHNBuilderMBlurSAHGrid : public Builder + { + typedef BVHN BVH; + typedef typename BVHN::NodeRef NodeRef; + typedef typename BVHN::NodeRecordMB NodeRecordMB; + typedef typename BVHN::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + mvector sgrids; + + + BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} + + + PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState pstate; + Scene::Iterator iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jvalid(j,range(0,1))) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,unsigned(geomID),unsigned(j)); + 
pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + size_t numPrimitives = pinfo.size(); + if (numPrimitives == 0) return pinfo; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); jgrid(j); + if (!mesh->valid(j,range(0,1))) continue; + + for (unsigned int y=0; ybuildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState pstate; + Scene::Iterator iter(scene); + + pstate.init(iter,size_t(1024)); + /* iterate over all meshes in the scene */ + PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t /*geomID*/) -> PrimInfoMB { + + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); jvalid(j, mesh->timeSegmentRange(t0t1))) continue; + LBBox3fa bounds(empty); + PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); + pinfoMB.merge(gridMB); + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> 
PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + size_t numPrimitives = pinfoMB.size(); + if (numPrimitives == 0) return pinfoMB; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + + k = base.size(); + size_t p_index = k; + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); jvalid(j, mesh->timeSegmentRange(t0t1))) continue; + const GridMesh::Grid &g = mesh->grid(j); + + for (unsigned int y=0; ylinearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index)); + pinfoMB.add_primref(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + assert(pinfoMB.size() == numPrimitives); + pinfoMB.time_range = t0t1; + return pinfoMB; + } + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid"); + + //const size_t numTimeSteps = scene->getNumTimeSteps(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + //if (numTimeSegments == 1) + // buildSingleSegment(numPrimitives); + //else + buildMultiSegment(numPrimitives); + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector prims(scene->device,numPrimitives); + const PrimInfo pinfo = 
createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + //TODO: check leaf_bytes + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build + (typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB::Create(), + typename BVH::AABBNodeMB::Set(), + CreateLeafGridMB(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + + + GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data()); + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + //FIXME: check leaf_bytes + //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN)); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * 
sizeof(SubGridQBVHN)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = false; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build(prims,pinfo,scene->device, + recalculatePrimRef, + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeafGrid(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + settings); + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#if defined(__AVX__) + Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new 
BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#if defined(__AVX__) + Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#if defined(__AVX__) + Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } +#if defined(__AVX__) + Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new 
BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); } +#if defined(__AVX__) + Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); } +#endif +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp new file mode 100644 index 00000000000..285b38c39d6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp @@ -0,0 +1,201 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" + +#include "../builders/primrefgen.h" +#include "../builders/primrefgen_presplit.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +namespace embree +{ + namespace isa + { + template + struct CreateLeafSpatial + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = 
BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; iscene); + } + return node; + } + + BVH* bvh; + }; + + template + struct BVHNBuilderFastSpatialSAH : public Builder + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + BVH* bvh; + Scene* scene; + Mesh* mesh; + mvector prims0; + GeneralBVHBuilder::Settings settings; + const float splitFactor; + unsigned int geomID_ = std::numeric_limits::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications) {} + + BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false); + numPreviousPrimitives = numOriginalPrimitives; + if (numOriginalPrimitives == 0) { + prims0.clear(); + bvh->clear(); + return; + } + + const unsigned int maxGeomID = mesh ? 
geomID_ : scene->getMaxGeomID(); + const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))); + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH")); + + /* create primref array */ + const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives)); + prims0.resize(numSplitPrimitives); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + NodeRef root(0); + PrimInfo pinfo; + + + if (likely(usePreSplits)) + { + /* spatial presplit SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray_presplit(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray_presplit(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHNBuilderVirtual::build(&bvh->alloc,CreateLeafSpatial(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings); + } + else + { + /* standard spatial split SAH BVH builder */ + pinfo = mesh ? 
+ createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) : + createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface); + + Splitter splitter(scene); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHBuilderBinnedFastSpatialSAH::build( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + CreateLeafSpatial(bvh), + splitter, + bvh->scene->progressInterface, + prims0.data(), + numSplitPrimitives, + pinfo,settings); + + /* ==================== */ + } + + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims0.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims0.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + + Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* 
BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp new file mode 100644 index 00000000000..1a78f347ac3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp @@ -0,0 +1,377 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder_twolevel.h" +#include "bvh_statistics.h" +#include "../builders/bvh_builder_sah.h" +#include "../common/scene_line_segments.h" +#include "../common/scene_triangle_mesh.h" +#include 
"../common/scene_quad_mesh.h" + +#define PROFILE 0 + +namespace embree +{ + namespace isa + { + template + BVHNBuilderTwoLevel::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold) + : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} + + template + BVHNBuilderTwoLevel::~BVHNBuilderTwoLevel () { + } + + // =========================================================================== + // =========================================================================== + // =========================================================================== + + template + void BVHNBuilderTwoLevel::build() + { + /* delete some objects */ + size_t num = scene->size(); + if (num < bvh->objects.size()) { + parallel_for(num, bvh->objects.size(), [&] (const range& r) { + for (size_t i=r.begin(); iobjects[i]; bvh->objects[i] = nullptr; + } + }); + } + +#if PROFILE + while(1) +#endif + { + /* reset memory allocator */ + bvh->alloc.reset(); + + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype,false); + + if (numPrimitives == 0) { + prims.resize(0); + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* calculate the size of the entire BVH */ + const size_t numLeafBlocks = Primitive::blocks(numPrimitives); + const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N; + const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel"); + + /* resize object array if scene got larger */ + if (bvh->objects.size() < num) bvh->objects.resize(num); + if (builders.size() < num) builders.resize(num); + resizeRefsList (); + nextRef.store(0); + + /* create acceleration structures */ + 
parallel_for(size_t(0), num, [&] (const range& r) + { + for (size_t objectID=r.begin(); objectIDgetSafe(objectID); + + /* ignore meshes we do not support */ + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + + if (isSmallGeometry(mesh)) { + setupSmallBuildRefBuilder (objectID, mesh); + } else { + setupLargeBuildRefBuilder (objectID, mesh); + } + } + }); + + /* parallel build of acceleration structures */ + parallel_for(size_t(0), num, [&] (const range& r) + { + for (size_t objectID=r.begin(); objectIDgetSafe(objectID); + if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) + continue; + + builders[objectID]->attachBuildRefs (this); + } + }); + + +#if PROFILE + double d0 = getSeconds(); +#endif + /* fast path for single geometry scenes */ + if (nextRef == 1) { + bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); + } + + else + { + /* open all large nodes */ + refs.resize(nextRef); + + /* this probably needs some more tuning */ + const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR)); + +#if !ENABLE_DIRECT_SAH_MERGE_BUILDER + +#if ENABLE_OPEN_SEQUENTIAL + open_sequential(extSize); +#endif + /* compute PrimRefs */ + prims.resize(refs.size()); +#endif + +#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL + tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount())); + limited.execute([&] +#endif + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); iset(BVH::emptyNode,empty,0); + + /* otherwise build toplevel hierarchy */ + else + { + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = 
BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(N); + settings.minLeafSize = 1; + settings.maxLeafSize = 1; + settings.travCost = 1.0f; + settings.intCost = 1.0f; + settings.singleThreadThreshold = singleThreadThreshold; + +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + refs.resize(extSize); + + NodeRef root = BVHBuilderBinnedOpenMergeSAH::build( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const BuildRef* refs, const range& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) refs[range.begin()].node; + }, + [&] (BuildRef &bref, BuildRef *refs) -> size_t { + return openBuildRef(bref,refs); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + refs.data(),extSize,pinfo,settings); +#else + NodeRef root = BVHBuilderBinnedSAH::build( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) prims[range.begin()].ID(); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + prims.data(),pinfo,settings); +#endif + + + bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives); + } + } +#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL + ); +#endif + + } + + bvh->alloc.cleanup(); + bvh->postBuild(t0); +#if PROFILE + double d1 = getSeconds(); + std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl; +#endif + } + + } + + template + void BVHNBuilderTwoLevel::deleteGeometry(size_t geomID) + { + if (geomID >= bvh->objects.size()) return; + if (builders[geomID]) builders[geomID].reset(); + delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr; + } + + template + void BVHNBuilderTwoLevel::clear() + { + for (size_t i=0; iobjects.size(); i++) + if (bvh->objects[i]) 
bvh->objects[i]->clear(); + + for (size_t i=0; i + void BVHNBuilderTwoLevel::open_sequential(const size_t extSize) + { + if (refs.size() == 0) + return; + + refs.reserve(extSize); + +#if 1 + for (size_t i=0;ichild(i) == BVH::emptyNode) continue; + refs.push_back(BuildRef(node->bounds(i),node->child(i))); + +#if 1 + NodeRef ref_pre = node->child(i); + if (ref_pre.isAABBNode()) + ref_pre.prefetch(); +#endif + std::push_heap (refs.begin(),refs.end()); + } + } + } + + template + void BVHNBuilderTwoLevel::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/) + { + if (builders[objectID] == nullptr || // new mesh + dynamic_cast(builders[objectID].get()) == nullptr) // size change resulted in large->small change + { + builders[objectID].reset (new RefBuilderSmall(objectID)); + } + } + + template + void BVHNBuilderTwoLevel::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh) + { + if (bvh->objects[objectID] == nullptr || // new mesh + builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality + dynamic_cast(builders[objectID].get()) == nullptr) // size change resulted in small->large change + { + Builder* builder = nullptr; + delete bvh->objects[objectID]; + createMeshAccel(objectID, builder); + builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality)); + } + } + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new 
BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#if defined(__AVX__) +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, 
bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h new file mode 100644 index 00000000000..8f57c3b4069 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h @@ -0,0 +1,263 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "bvh_builder_twolevel_internal.h" +#include "bvh.h" +#include "../common/primref.h" +#include "../builders/priminfo.h" +#include "../builders/primrefgen.h" + +/* new open/merge builder */ +#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 +#define ENABLE_OPEN_SEQUENTIAL 0 +#define SPLIT_MEMORY_RESERVE_FACTOR 1000 +#define SPLIT_MEMORY_RESERVE_SCALE 2 +#define SPLIT_MIN_EXT_SPACE 1000 + +namespace embree +{ + namespace isa + { + template + class BVHNBuilderTwoLevel : public Builder + { + typedef BVHN BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + __forceinline static bool isSmallGeometry(Mesh* mesh) { + return mesh->size() <= 4; + } + + public: + + typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + + struct BuildRef : public PrimRef + { + public: + __forceinline BuildRef () {} + + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node) + : PrimRef(bounds,(size_t)node), node(node) + { + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + /* used by the open/merge bvh builder */ + 
__forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives) + : PrimRef(bounds,geomID,numPrimitives), node(node) + { + /* important for relative buildref ordering */ + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + __forceinline size_t size() const { + return primID(); + } + + friend bool operator< (const BuildRef& a, const BuildRef& b) { + return a.bounds_area < b.bounds_area; + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }"; + } + + __forceinline unsigned int numPrimitives() const { return primID(); } + + public: + NodeRef node; + float bounds_area; + }; + + + __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { + if (bref.node.isLeaf()) + { + refs[0] = bref; + return 1; + } + NodeRef ref = bref.node; + unsigned int geomID = bref.geomID(); + unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); + AABBNode* node = ref.getAABBNode(); + size_t n = 0; + for (size_t i=0; ichild(i) == BVH::emptyNode) continue; + refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims); + n++; + } + assert(n > 1); + return n; + } + + /*! Constructor. */ + BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD); + + /*! Destructor */ + ~BVHNBuilderTwoLevel (); + + /*! 
builder entry point */ + void build(); + void deleteGeometry(size_t geomID); + void clear(); + + void open_sequential(const size_t extSize); + + private: + + class RefBuilderBase { + public: + virtual ~RefBuilderBase () {} + virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0; + virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; + }; + + class RefBuilderSmall : public RefBuilderBase { + public: + + RefBuilderSmall (size_t objectID) + : objectID_ (objectID) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) { + + Mesh* mesh = topBuilder->scene->template getSafe(objectID_); + size_t meshSize = mesh->size(); + assert(isSmallGeometry(mesh)); + + mvector prefs(topBuilder->scene->device, meshSize); + auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface); + + size_t begin=0; + while (begin < pinfo.size()) + { + Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1); + accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene); + + /* create build primitive */ +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node); +#endif + } + assert(begin == pinfo.size()); + } + + bool meshQualityChanged (RTCBuildQuality /*currQuality*/) { + return false; + } + + size_t objectID_; + }; + + class RefBuilderLarge : public RefBuilderBase { + public: + + RefBuilderLarge (size_t objectID, const Ref& builder, RTCBuildQuality quality) + : objectID_ (objectID), builder_ (builder), quality_ (quality) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) + { + BVH* object = topBuilder->getBVH(objectID_); assert(object); + + /* build object if it got modified */ + if 
(topBuilder->isGeometryModified(objectID_)) + builder_->build(); + + /* create build primitive */ + if (!object->getBounds().empty()) + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + Mesh* mesh = topBuilder->getMesh(objectID_); + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size()); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root); +#endif + } + } + + bool meshQualityChanged (RTCBuildQuality currQuality) { + return currQuality != quality_; + } + + private: + size_t objectID_; + Ref builder_; + RTCBuildQuality quality_; + }; + + void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh); + void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh); + + BVH* getBVH (size_t objectID) { + return this->bvh->objects[objectID]; + } + Mesh* getMesh (size_t objectID) { + return this->scene->template getSafe(objectID); + } + bool isGeometryModified (size_t objectID) { + return this->scene->isGeometryModified(objectID); + } + + void resizeRefsList () + { + size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), + [this](const range& r)->size_t { + size_t c = 0; + for (auto i=r.begin(); igetSafe(i); + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + size_t meshSize = mesh->size(); + c += isSmallGeometry(mesh) ? 
Primitive::blocks(meshSize) : 1; + } + return c; + }, + std::plus() + ); + + if (refs.size() < num) { + refs.resize(num); + } + } + + void createMeshAccel (size_t geomID, Builder*& builder) + { + bvh->objects[geomID] = new BVH(Primitive::type,scene); + BVH* accel = bvh->objects[geomID]; + auto mesh = scene->getSafe(geomID); + if (nullptr == mesh) { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type"); + return; + } + + __internal_two_level_builder__::MeshBuilder()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder); + } + + using BuilderList = std::vector>; + + BuilderList builders; + BVH* bvh; + Scene* scene; + mvector refs; + mvector prims; + std::atomic nextRef; + const size_t singleThreadThreshold; + Geometry::GTypeMask gtype; + bool useMortonBuilder_ = false; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h new file mode 100644 index 00000000000..1c1ae8d6a72 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h @@ -0,0 +1,267 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA 
unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA 
unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA 
unsigned int COMMA size_t) + + namespace isa + { + + namespace __internal_two_level_builder__ { + + template + struct MortonBuilder {}; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct 
MortonBuilder<8,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + + template + struct SAHBuilder {}; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return 
BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct 
SAHBuilder<8,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + + template + struct RefitBuilder {}; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct 
RefitBuilder<8,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + + template + struct MeshBuilder { + MeshBuilder () {} + void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { + if(useMortonBuilder) { + builder = MortonBuilder()(bvh,mesh,geomID,gtype); + return; + } + switch (mesh->quality) { + case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_MEDIUM: + case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder()(bvh,mesh,geomID,gtype); break; + default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); + } + } + }; + } + } +} \ No newline at end of file diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp 
b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp new file mode 100644 index 00000000000..a27be8bae83 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp @@ -0,0 +1,375 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_collider.h" +#include "../geometry/triangle_triangle_intersector.h" + +namespace embree +{ + namespace isa + { +#define CSTAT(x) + + size_t parallel_depth_threshold = 3; + CSTAT(std::atomic bvh_collide_traversal_steps(0)); + CSTAT(std::atomic bvh_collide_leaf_pairs(0)); + CSTAT(std::atomic bvh_collide_leaf_iterations(0)); + CSTAT(std::atomic bvh_collide_prim_intersections1(0)); + CSTAT(std::atomic bvh_collide_prim_intersections2(0)); + CSTAT(std::atomic bvh_collide_prim_intersections3(0)); + CSTAT(std::atomic bvh_collide_prim_intersections4(0)); + CSTAT(std::atomic bvh_collide_prim_intersections5(0)); + CSTAT(std::atomic bvh_collide_prim_intersections(0)); + + struct Collision + { + __forceinline Collision() {} + + __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1) + : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {} + + unsigned geomID0; + unsigned primID0; + unsigned geomID1; + unsigned primID1; + }; + + template + __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN::AABBNode& node1) + { + const vfloat lower_x = max(vfloat(box0.lower.x),node1.lower_x); + const vfloat lower_y = max(vfloat(box0.lower.y),node1.lower_y); + const vfloat lower_z = max(vfloat(box0.lower.z),node1.lower_z); + const vfloat upper_x = min(vfloat(box0.upper.x),node1.upper_x); + const vfloat upper_y = min(vfloat(box0.upper.y),node1.upper_y); + const vfloat upper_z = min(vfloat(box0.upper.z),node1.upper_z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template + __forceinline size_t overlap(const BBox3fa& box0, const BBox>>& box1) + { + const vfloat lower_x = 
max(vfloat(box0.lower.x),box1.lower.x); + const vfloat lower_y = max(vfloat(box0.lower.y),box1.lower.y); + const vfloat lower_z = max(vfloat(box0.lower.z),box1.lower.z); + const vfloat upper_x = min(vfloat(box0.upper.x),box1.upper.x); + const vfloat upper_y = min(vfloat(box0.upper.y),box1.upper.y); + const vfloat upper_z = min(vfloat(box0.upper.z),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template + __forceinline size_t overlap(const BBox>>& box0, size_t i, const BBox>>& box1) + { + const vfloat lower_x = max(vfloat(box0.lower.x[i]),box1.lower.x); + const vfloat lower_y = max(vfloat(box0.lower.y[i]),box1.lower.y); + const vfloat lower_z = max(vfloat(box0.lower.z[i]),box1.lower.z); + const vfloat upper_x = min(vfloat(box0.upper.x[i]),box1.upper.x); + const vfloat upper_y = min(vfloat(box0.upper.y[i]),box1.upper.y); + const vfloat upper_z = min(vfloat(box0.upper.z[i]),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) + { + CSTAT(bvh_collide_prim_intersections1++); + const TriangleMesh* mesh0 = scene0->get(geomID0); + const TriangleMesh* mesh1 = scene1->get(geomID1); + const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0); + const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1); + + /* special culling for scene intersection with itself */ + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore self intersections */ + if (primID0 == primID1) + return false; + } + CSTAT(bvh_collide_prim_intersections2++); + + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore intersection with topological neighbors */ + const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); + if (any(vint4(tri1.v[0]) == t0)) return false; + if (any(vint4(tri1.v[1]) == t0)) return false; + if 
(any(vint4(tri1.v[2]) == t0)) return false; + } + CSTAT(bvh_collide_prim_intersections3++); + + const Vec3fa a0 = mesh0->vertex(tri0.v[0]); + const Vec3fa a1 = mesh0->vertex(tri0.v[1]); + const Vec3fa a2 = mesh0->vertex(tri0.v[2]); + const Vec3fa b0 = mesh1->vertex(tri1.v[0]); + const Vec3fa b1 = mesh1->vertex(tri1.v[1]); + const Vec3fa b2 = mesh1->vertex(tri1.v[2]); + + return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2); + } + + template + __forceinline void BVHNColliderUserGeom::processLeaf(NodeRef node0, NodeRef node1) + { + Collision collisions[16]; + size_t num_collisions = 0; + + size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); + size_t N1; Object* leaf1 = (Object*) node1.leaf(N1); + for (size_t i=0; iscene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; + collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1); + if (num_collisions == 16) { + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + num_collisions = 0; + } + } + } + if (num_collisions) + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + } + + template + void BVHNCollider::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) + { + CSTAT(bvh_collide_traversal_steps++); + if (unlikely(ref0.isLeaf())) { + if (unlikely(ref1.isLeaf())) { + CSTAT(bvh_collide_leaf_pairs++); + processLeaf(ref0,ref1); + return; + } else goto recurse_node1; + + } else { + if (unlikely(ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(bounds0) > area(bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + AABBNode* node0 = ref0.getAABBNode(); + size_t mask = overlap(bounds1,*node0); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + 
collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + } + return; + } + + { + recurse_node1: + AABBNode* node1 = ref1.getAABBNode(); + size_t mask = overlap(bounds0,*node1); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + } + return; + } + } + + template + void BVHNCollider::split(const CollideJob& job, jobvector& jobs) + { + if (unlikely(job.ref0.isLeaf())) { + if (unlikely(job.ref1.isLeaf())) { + jobs.push_back(job); + return; + } else goto recurse_node1; + } else { + if (unlikely(job.ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(job.bounds0) > area(job.bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + const AABBNode* node0 = job.ref0.getAABBNode(); + size_t mask = overlap(job.bounds1,*node0); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1)); + } + return; + } + + { + recurse_node1: + const AABBNode* node1 = job.ref1.getAABBNode(); + size_t mask = overlap(job.bounds0,*node1); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1)); + } + return; + } + } + + template + void 
BVHNCollider::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) + { + CSTAT(bvh_collide_traversal_steps = 0); + CSTAT(bvh_collide_leaf_pairs = 0); + CSTAT(bvh_collide_leaf_iterations = 0); + CSTAT(bvh_collide_prim_intersections1 = 0); + CSTAT(bvh_collide_prim_intersections2 = 0); + CSTAT(bvh_collide_prim_intersections3 = 0); + CSTAT(bvh_collide_prim_intersections4 = 0); + CSTAT(bvh_collide_prim_intersections5 = 0); + CSTAT(bvh_collide_prim_intersections = 0); +#if 0 + collide_recurse(ref0,bounds0,ref1,bounds1,0,0); +#else + const int M = 2048; + jobvector jobs[2]; + jobs[0].reserve(M); + jobs[1].reserve(M); + jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0)); + int source = 0; + int target = 1; + + /* try to split job until job list is full */ + while (jobs[source].size()+8 <= M) + { + for (size_t i=0; i M) { + jobs[target].push_back(job); + } else { + split(job,jobs[target]); + } + } + + /* stop splitting jobs if we reached only leaves and cannot make progress anymore */ + if (jobs[target].size() == jobs[source].size()) + break; + + jobs[source].resize(0); + std::swap(source,target); + } + + /* parallel processing of all jobs */ + parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) { + CollideJob& j = jobs[source][i]; + collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); + }); + + +#endif + CSTAT(PRINT(bvh_collide_traversal_steps)); + CSTAT(PRINT(bvh_collide_leaf_pairs)); + CSTAT(PRINT(bvh_collide_leaf_iterations)); + CSTAT(PRINT(bvh_collide_prim_intersections1)); + CSTAT(PRINT(bvh_collide_prim_intersections2)); + CSTAT(PRINT(bvh_collide_prim_intersections3)); + CSTAT(PRINT(bvh_collide_prim_intersections4)); + CSTAT(PRINT(bvh_collide_prim_intersections5)); + CSTAT(PRINT(bvh_collide_prim_intersections)); + } + + template + void BVHNColliderUserGeom::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) + { + 
BVHNColliderUserGeom(bvh0->scene,bvh1->scene,callback,userPtr). + collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds()); + } + +#if defined (EMBREE_LOWEST_ISA) + struct collision_regression_test : public RegressionTest + { + collision_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), + Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle 
(Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; + return passed; + } + }; + + collision_regression_test collision_regression("collision_regression_test"); +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Collider Definitions + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); + +#if defined(__AVX__) + DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); 
+#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h new file mode 100644 index 00000000000..ac4f99c96a7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h @@ -0,0 +1,72 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/trianglev.h" +#include "../geometry/object.h" + +namespace embree +{ + namespace isa + { + template + class BVHNCollider + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + struct CollideJob + { + CollideJob () {} + + CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0, + NodeRef ref1, const BBox3fa& bounds1, size_t depth1) + : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {} + + NodeRef ref0; + BBox3fa bounds0; + size_t depth0; + NodeRef ref1; + BBox3fa bounds1; + size_t depth1; + }; + + typedef vector_t> jobvector; + + void split(const CollideJob& job, jobvector& jobs); + + public: + __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} + + public: + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; + void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); + void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); + + protected: + Scene* scene0; + Scene* scene1; + RTCCollideFunc callback; + void* userPtr; + }; + + template + class BVHNColliderUserGeom : public BVHNCollider + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + 
: BVHNCollider(scene0,scene1,callback,userPtr) {} + + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); + public: + static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h new file mode 100644 index 00000000000..54021ca6eb9 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h @@ -0,0 +1,21 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../common/isa.h" +#include "../common/accel.h" +#include "../common/scene.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + /*! BVH instantiations */ + class BVHFactory + { + public: + enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY }; + enum class IntersectVariant { FAST, ROBUST }; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp new file mode 100644 index 00000000000..ea6adc27174 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp @@ -0,0 +1,330 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.h" +#include "node_intersector1.h" +#include "bvh_traverser1.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" 
+#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + template + void BVHNIntersector1::intersect(const Accel::Intersectors* __restrict__ This, + RayHit& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + StackItemT stack[stackSize]; // stack of nodes + StackItemT* stackPtr = stack+1; // current stack pointer + StackItemT* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + if (bvh->root == BVH::emptyNode) + return; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ +#if defined(__AVX512ER__) + /* much faster on KNL */ + if (unlikely(any(vfloat(*(float*)&stackPtr->dist) > tray.tfar))) + continue; +#else + if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) + continue; +#endif + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat 
tNear; + STAT3(normal.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node); + tray.tfar = ray.tfar; + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template + void BVHNIntersector1::occluded(const Accel::Intersectors* __restrict__ This, + Ray& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* early out for already occluded rays */ + if (unlikely(ray.tfar < 0.0f)) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + NodeRef stack[stackSize]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSize; + stack[0] = bvh->root; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), 
max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(shadow.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) { + ray.tfar = neg_inf; + break; + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + *stackPtr = (NodeRef)lazy_node; + stackPtr++; + } + } + } + + template + struct PointQueryDispatch + { + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? 
vextend::size : N; + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return false; + + /* stack state */ + StackItemT stack[stackSize]; // stack of nodes + StackItemT* stackPtr = stack+1; // current stack pointer + StackItemT* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + /* verify correct input */ + assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f)); + + /* load the point query into SIMD registers */ + TravPointQuery tquery(query->p, context->query_radius); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit nodeTraverser; + + bool changed = false; + float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > cull_radius)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat tNear; + STAT3(point_query.trav_nodes,1,1,1); + bool nodeIntersected; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + nodeIntersected = BVHNNodePointQuerySphere1::pointQuery(cur, tquery, query->time, tNear, mask); + } else { + nodeIntersected = BVHNNodePointQueryAABB1 ::pointQuery(cur, tquery, query->time, tNear, mask); + } + if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + 
nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(point_query.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node)) + { + changed = true; + tquery.rad = context->query_radius; + cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + return changed; + } + }; + + /* disable point queries for not yet supported geometry types */ + template + struct PointQueryDispatch { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template + struct PointQueryDispatch { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template + struct PointQueryDispatch { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template + bool BVHNIntersector1::pointQuery( + const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + return PointQueryDispatch::pointQuery(This, query, context); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h new file mode 100644 index 00000000000..1a269c319a7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" 
+#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + /*! BVH single ray intersector. */ + template + class BVHNIntersector1 + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend::size : N; + + public: + static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); + static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); + static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp new file mode 100644 index 00000000000..989f7354fda --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.cpp" + +namespace embree +{ + namespace isa + { + int getISA() { + return VerifyMultiTargetLinking::getISA(); + } + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector1 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >)); + 
IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 
COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1 > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1 > >)); + + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1 >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1 >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1 > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1 > >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >)); + 
IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h new file mode 100644 index 00000000000..d764cc928d7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" +#include "node_intersector_frustum.h" + +namespace embree +{ + namespace isa + { + template + struct TravRayK; + + /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */ + template + class BVHNIntersectorKHybrid + { + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = types == BVH_AN1 ? vextend::size : N; + + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersectorK::Precalculations Precalculations; + typedef typename PrimitiveIntersectorK::Primitive Primitive; + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; + + static const size_t switchThresholdIncoherent = \ + (K==4) ? 3 : + (K==8) ? ((N==4) ? 5 : 7) : + (K==16) ? 
14 : // 14 seems to work best for KNL due to better ordered chunk traversal + 0; + + private: + static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayHitK& ray, const TravRayK& tray, IntersectContext* context); + static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayK& ray, const TravRayK& tray, IntersectContext* context); + + public: + static void intersect(vint* valid, Accel::Intersectors* This, RayHitK& ray, IntersectContext* context); + static void occluded (vint* valid, Accel::Intersectors* This, RayK& ray, IntersectContext* context); + + static void intersectCoherent(vint* valid, Accel::Intersectors* This, RayHitK& ray, IntersectContext* context); + static void occludedCoherent (vint* valid, Accel::Intersectors* This, RayK& ray, IntersectContext* context); + + }; + + /*! BVH packet intersector. */ + template + class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid {}; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h new file mode 100644 index 00000000000..83d1fb4d3df --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h @@ -0,0 +1,295 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector_packet_stream.h" +#include "node_intersector_frustum.h" +#include "bvh_traverser_stream.h" + +namespace embree +{ + namespace isa + { + /*! BVH ray stream intersector. */ + template + class BVHNIntersectorStream + { + static const int Nxd = (Nx == N) ? 
N : Nx/2; + + /* shortcuts for frequently used types */ + template using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type; + template using PrimitiveK = typename PrimitiveIntersectorK::PrimitiveK; + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + + template + __forceinline static size_t initPacketsAndFrustum(RayK** inputPackets, size_t numOctantRays, + TravRayKStream* packets, Frustum& frustum, bool& commonOctant) + { + const size_t numPackets = (numOctantRays+K-1)/K; + + Vec3vf tmp_min_rdir(pos_inf); + Vec3vf tmp_max_rdir(neg_inf); + Vec3vf tmp_min_org(pos_inf); + Vec3vf tmp_max_org(neg_inf); + vfloat tmp_min_dist(pos_inf); + vfloat tmp_max_dist(neg_inf); + + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat tnear = inputPackets[i]->tnear(); + const vfloat tfar = inputPackets[i]->tfar; + vbool m_valid = (tnear <= tfar) & (tnear >= 0.0f); + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + m_valid &= inputPackets[i]->valid(); +#endif + + m_active |= (size_t)movemask(m_valid) << (i*K); + + vfloat packet_min_dist = max(tnear, 0.0f); + vfloat packet_max_dist = select(m_valid, tfar, neg_inf); + tmp_min_dist = min(tmp_min_dist, packet_min_dist); + tmp_max_dist = max(tmp_max_dist, packet_max_dist); + + const Vec3vf& org = inputPackets[i]->org; + const Vec3vf& dir = inputPackets[i]->dir; + + new (&packets[i]) TravRayKStream(org, dir, packet_min_dist, packet_max_dist); + + tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf(pos_inf))); + tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf(neg_inf))); + tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf(pos_inf))); + tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf(neg_inf))); + } + + m_active &= (numOctantRays == (8 * sizeof(size_t))) ? 
(size_t)-1 : (((size_t)1 << numOctantRays)-1); + + + const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), + reduce_min(tmp_min_rdir.y), + reduce_min(tmp_min_rdir.z)); + + const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), + reduce_max(tmp_max_rdir.y), + reduce_max(tmp_max_rdir.z)); + + const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), + reduce_min(tmp_min_org.y), + reduce_min(tmp_min_org.z)); + + const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), + reduce_max(tmp_max_org.y), + reduce_max(tmp_max_org.z)); + + commonOctant = + (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && + (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && + (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); + + const float frustum_min_dist = reduce_min(tmp_min_dist); + const float frustum_max_dist = reduce_max(tmp_max_dist); + + frustum.init(reduced_min_origin, reduced_max_origin, + reduced_min_rdir, reduced_max_rdir, + frustum_min_dist, frustum_max_dist, + N); + + return m_active; + } + + template + __forceinline static size_t intersectAABBNodePacket(size_t m_active, + const TravRayKStream* packets, + const AABBNode* __restrict__ node, + size_t boxID, + const NearFarPrecalculations& nf) + { + assert(m_active); + const size_t startPacketID = bsf(m_active) / K; + const size_t endPacketID = bsr(m_active) / K; + size_t m_trav_active = 0; + for (size_t i = startPacketID; i <= endPacketID; i++) + { + const size_t m_hit = intersectNodeK(node, boxID, packets[i], nf); + m_trav_active |= m_hit << (i*K); + } + return m_trav_active; + } + + template + __forceinline static size_t traverseCoherentStream(size_t m_active, + TravRayKStream* packets, + const AABBNode* __restrict__ node, + const Frustum& frustum, + size_t* maskK, + vfloat& dist) + { + size_t m_node_hit = intersectNodeFrustum(node, frustum, dist); + const size_t first_index = bsf(m_active); + const size_t first_packetID = first_index / K; + const size_t first_rayID = first_index % K; + 
size_t m_first_hit = intersectNode1(node, packets[first_packetID], first_rayID, frustum.nf); + + /* this make traversal independent of the ordering of rays */ + size_t m_node = m_node_hit ^ m_first_hit; + while (unlikely(m_node)) + { + const size_t boxID = bscf(m_node); + const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); + m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); + maskK[boxID] = m_current; + } + return m_node_hit; + } + + // TODO: explicit 16-wide path for KNL + template + __forceinline static vint traverseIncoherentStream(size_t m_active, + TravRayKStreamFast* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); + const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); + const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); + const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); + const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint bitmask(shiftTable[rayID]); + +#if defined (__aarch64__) + const vfloat tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); + const vfloat tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat tFarZ = madd(bmaxZ, 
p.rdir.z[i], p.neg_org_rdir.z[i]); +#else + const vfloat tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); +#endif + + const vfloat tNear = maxi(tNearX, tNearY, tNearZ, vfloat(p.tnear[i])); + const vfloat tFar = mini(tFarX , tFarY , tFarZ, vfloat(p.tfar[i])); + +#if defined(__AVX512ER__) + const vboolx m_node((1 << N)-1); + const vbool hit_mask = le(m_node, tNear, tFar); + vmask = mask_or(hit_mask, vmask, vmask, bitmask); +#else + const vbool hit_mask = tNear <= tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif +#endif + } while(m_active); + return vmask; + } + + template + __forceinline static vint traverseIncoherentStream(size_t m_active, + TravRayKStreamRobust* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); + const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); + const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); + const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); + const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream &p = packets[rayID / K]; + const size_t i = rayID % K; + 
const vint bitmask(shiftTable[rayID]); + const vfloat tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; + const vfloat tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; + const vfloat tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; + const vfloat tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; + const vfloat tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat tNear = maxi(tNearX, tNearY, tNearZ, vfloat(p.tnear[i])); + const vfloat tFar = mini(tFarX , tFarY , tFarZ, vfloat(p.tfar[i])); + const float round_down = 1.0f-2.0f*float(ulp); + const float round_up = 1.0f+2.0f*float(ulp); +#if defined(__AVX512ER__) + const vboolx m_node((1 << N)-1); + const vbool hit_mask = le(m_node, round_down*tNear, round_up*tFar); + vmask = mask_or(hit_mask, vmask, vmask, bitmask); +#else + const vbool hit_mask = round_down*tNear <= round_up*tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif +#endif + } while(m_active); + return vmask; + } + + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; + + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template + static void intersectCoherent(Accel::Intersectors* This, RayHitK** inputRays, size_t numRays, IntersectContext* context); + + template + static void occludedCoherent(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); + + template + static void occludedIncoherent(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); + }; + + + /*! BVH ray stream intersector with direct fallback to packets. 
*/ + template + class BVHNIntersectorStreamPacketFallback + { + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template + static void intersectK(Accel::Intersectors* This, RayHitK** inputRays, size_t numRays, IntersectContext* context); + + template + static void occludedK(Accel::Intersectors* This, RayK** inputRays, size_t numRays, IntersectContext* context); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h new file mode 100644 index 00000000000..cdeb9236370 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h @@ -0,0 +1,41 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/ray.h" +#include "../common/scene.h" + +namespace embree +{ + namespace isa + { + class RayStreamFilter + { + public: + static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context); + static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context); + static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context); + + static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context); + static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context); + static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t 
N, IntersectContext* context); + + private: + template + static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context); + + template + static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context); + + template + static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + + template + static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context); + }; + } +}; diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h new file mode 100644 index 00000000000..baa4a8d8053 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h @@ -0,0 +1,213 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN AABBNode */ + template + struct AABBNode_t : public BaseNode_t + { + using BaseNode_t::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const { + node.getAABBNode()->setRef(i,child); + node.getAABBNode()->setBounds(i,bounds); + } + }; + + struct Create2 + { + template + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear(); + for (size_t i=0; isetBounds(i,children[i].bounds()); + return NodeRef::encodeNode(node); + } + }; + + struct Set2 + { + template + __forceinline NodeRef operator() (const 
BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; isetRef(i,children[i]); + return ref; + } + }; + + struct Set3 + { + Set3 (FastAllocator* allocator, PrimRef* prims) + : allocator(allocator), prims(prims) {} + + template + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; isetRef(i,children[i]); + + if (unlikely(precord.alloc_barrier)) + { + PrimRef* begin = &prims[precord.prims.begin()]; + PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!! + size_t bytes = (size_t)end - (size_t)begin; + allocator->addBlock(begin,bytes); + } + + return ref; + } + + FastAllocator* const allocator; + PrimRef* const prims; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = pos_inf; + upper_x = upper_y = upper_z = neg_inf; + BaseNode_t::clear(); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds) + { + assert(i < N); + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) { + setBounds(i,bounds); + children[i] = ref; + } + + /*! Returns bounds of node. 
*/ + __forceinline BBox3fa bounds() const { + const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)); + const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)); + return BBox3fa(lower,upper); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]); + const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extend(size_t i) const { + return bounds(i).size(); + } + + /*! Returns bounds of all children (implemented later as specializations) */ + __forceinline void bounds(BBox& bounds0, BBox& bounds1, BBox& bounds2, BBox& bounds3) const; + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(ichildren[i],b->children[j]); + std::swap(a->lower_x[i],b->lower_x[j]); + std::swap(a->lower_y[i],b->lower_y[j]); + std::swap(a->lower_z[i],b->lower_z[j]); + std::swap(a->upper_x[i],b->upper_x[j]); + std::swap(a->upper_y[i],b->upper_y[j]); + std::swap(a->upper_z[i],b->upper_z[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNode_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; ichild(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i lower_x; //!< X dimension of lower bounds of all N children. + vfloat upper_x; //!< X dimension of upper bounds of all N children. + vfloat lower_y; //!< Y dimension of lower bounds of all N children. 
+ vfloat upper_y; //!< Y dimension of upper bounds of all N children. + vfloat lower_z; //!< Z dimension of lower bounds of all N children. + vfloat upper_z; //!< Z dimension of upper bounds of all N children. + }; + + template<> + __forceinline void AABBNode_t,4>::bounds(BBox& bounds0, BBox& bounds1, BBox& bounds2, BBox& bounds3) const { + transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower); + transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper); + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h new file mode 100644 index 00000000000..501f4bce5bd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h @@ -0,0 +1,247 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Motion Blur AABBNode */ + template + struct AABBNodeMB_t : public BaseNode_t + { + using BaseNode_t::children; + typedef BVHNodeRecord NodeRecord; + typedef BVHNodeRecordMB NodeRecordMB; + typedef BVHNodeRecordMB4D NodeRecordMB4D; + + struct Create + { + template + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + template + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; isetRef(i,children[i].ref); + node->setBounds(i,children[i].lbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + }; + 
+ struct SetTimeRange + { + __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {} + + template + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; isetRef(i, children[i].ref); + node->setBounds(i, children[i].lbounds, tbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + + BBox1f tbounds; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = vfloat(pos_inf); + upper_x = upper_y = upper_z = vfloat(neg_inf); + lower_dx = lower_dy = lower_dz = vfloat(0.0f); + upper_dx = upper_dy = upper_dz = vfloat(0.0f); + BaseNode_t::clear(); + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, NodeRef ref) { + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i) + { + /*! for empty bounds we have to avoid inf-inf=nan */ + BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX))); + BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX))); + bounds0 = bounds0.enlarge_by(4.0f*float(ulp)); + bounds1 = bounds1.enlarge_by(4.0f*float(ulp)); + Vec3fa dlower = bounds1.lower-bounds0.lower; + Vec3fa dupper = bounds1.upper-bounds0.upper; + + lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z; + upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z; + + lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z; + upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z; + } + + /*! Sets bounding box of child. 
*/ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds) { + setBounds(i, bounds.bounds0, bounds.bounds1); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) { + setBounds(i, bounds.global(tbounds)); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) { + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + children[i] = ref; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) + { + setRef(i, child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Return bounding box for time 0 */ + __forceinline BBox3fa bounds0(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]), + Vec3fa(upper_x[i],upper_y[i],upper_z[i])); + } + + /*! Return bounding box for time 1 */ + __forceinline BBox3fa bounds1(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]), + Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i])); + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)), + reduce_min(min(lower_y,lower_y+lower_dy)), + reduce_min(min(lower_z,lower_z+lower_dz))), + Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)), + reduce_max(max(upper_y,upper_y+upper_dy)), + reduce_max(max(upper_z,upper_z+upper_dz)))); + } + + /*! Return bounding box of child i */ + __forceinline BBox3fa bounds(size_t i) const { + return merge(bounds0(i),bounds1(i)); + } + + /*! Return linear bounding box of child i */ + __forceinline LBBox3fa lbounds(size_t i) const { + return LBBox3fa(bounds0(i),bounds1(i)); + } + + /*! 
Return bounding box of child i at specified time */ + __forceinline BBox3fa bounds(size_t i, float time) const { + return lerp(bounds0(i),bounds1(i),time); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return lbounds(i).expectedHalfArea(); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const { + return lbounds(i).expectedHalfArea(t0t1); + } + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; ichild(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i lower_x; //!< X dimension of lower bounds of all N children. + vfloat upper_x; //!< X dimension of upper bounds of all N children. + vfloat lower_y; //!< Y dimension of lower bounds of all N children. + vfloat upper_y; //!< Y dimension of upper bounds of all N children. + vfloat lower_z; //!< Z dimension of lower bounds of all N children. + vfloat upper_z; //!< Z dimension of upper bounds of all N children. + + vfloat lower_dx; //!< X dimension of lower bounds of all N children. + vfloat upper_dx; //!< X dimension of upper bounds of all N children. + vfloat lower_dy; //!< Y dimension of lower bounds of all N children. + vfloat upper_dy; //!< Y dimension of upper bounds of all N children. + vfloat lower_dz; //!< Z dimension of lower bounds of all N children. + vfloat upper_dz; //!< Z dimension of upper bounds of all N children. 
+ }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h new file mode 100644 index 00000000000..e968bbbc399 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h @@ -0,0 +1,107 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_aabb_mb.h" + +namespace embree +{ + /*! Aligned 4D Motion Blur Node */ + template + struct AABBNodeMB4D_t : public AABBNodeMB_t + { + using BaseNode_t::children; + using AABBNodeMB_t::set; + + typedef BVHNodeRecord NodeRecord; + typedef BVHNodeRecordMB NodeRecordMB; + typedef BVHNodeRecordMB4D NodeRecordMB4D; + + struct Create + { + template + __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const + { + if (hasTimeSplits) + { + AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + else + { + AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + } + }; + + struct Set + { + template + __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const + { + if (likely(ref.isAABBNodeMB())) { + for (size_t i=0; iset(i, children[i]); + } else { + for (size_t i=0; iset(i, children[i]); + } + } + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_t = vfloat(pos_inf); + upper_t = vfloat(neg_inf); + AABBNodeMB_t::clear(); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) + { + AABBNodeMB_t::setBounds(i, bounds.global(tbounds)); + lower_t[i] = tbounds.lower; + upper_t[i] = tbounds.upper == 1.0f ? 
1.0f+float(ulp) : tbounds.upper; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) { + AABBNodeMB_t::setRef(i,child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return AABBNodeMB_t::lbounds(i).expectedHalfArea(timeRange(i)); + } + + /*! returns time range for specified child */ + __forceinline BBox1f timeRange(size_t i) const { + return BBox1f(lower_t[i],upper_t[i]); + } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) + { + cout << "AABBNodeMB4D {" << embree_endl; + for (size_t i=0; i lower_t; //!< time dimension of lower bounds of all N children + vfloat upper_t; //!< time dimension of upper bounds of all N children + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h new file mode 100644 index 00000000000..8268f3b9328 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h @@ -0,0 +1,43 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_ref.h" + +namespace embree +{ + + /*! BVHN Base Node */ + template + struct BaseNode_t + { + /*! Clears the node. 
*/ + __forceinline void clear() + { + for (size_t i=0; i + struct OBBNode_t : public BaseNode_t + { + using BaseNode_t::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const { + node.ungetAABBNode()->setRef(i,child); + node.ungetAABBNode()->setBounds(i,bounds); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + naabb.l.vx = Vec3fa(nan); + naabb.l.vy = Vec3fa(nan); + naabb.l.vz = Vec3fa(nan); + naabb.p = Vec3fa(nan); + BaseNode_t::clear(); + } + + /*! Sets bounding box. */ + __forceinline void setBounds(size_t i, const OBBox3fa& b) + { + assert(i < N); + + AffineSpace3fa space = b.space; + space.p -= b.bounds.lower; + space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; + + naabb.l.vx.x[i] = space.l.vx.x; + naabb.l.vx.y[i] = space.l.vx.y; + naabb.l.vx.z[i] = space.l.vx.z; + + naabb.l.vy.x[i] = space.l.vy.x; + naabb.l.vy.y[i] = space.l.vy.y; + naabb.l.vy.z[i] = space.l.vy.z; + + naabb.l.vz.x[i] = space.l.vz.x; + naabb.l.vz.y[i] = space.l.vz.y; + naabb.l.vz.z[i] = space.l.vz.z; + + naabb.p.x[i] = space.p.x; + naabb.p.y[i] = space.p.y; + naabb.p.z[i] = space.p.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! 
Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent(size_t i) const { + assert(i naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space) + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h new file mode 100644 index 00000000000..834cf5ec28b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h @@ -0,0 +1,90 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + template + struct OBBNodeMB_t : public BaseNode_t + { + using BaseNode_t::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const { + node.ungetAABBNodeMB()->setRef(i,child); + node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt)); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + space0 = one; + //b0.lower = b0.upper = Vec3fa(nan); + b1.lower = b1.upper = Vec3fa(nan); + BaseNode_t::clear(); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) { + setBounds(i,space,lbounds.bounds0,lbounds.bounds1); + } + + /*! Sets space and bounding boxes. 
*/ + __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c) + { + assert(i < N); + + AffineSpace3fa space = s0; + space.p -= a.lower; + Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); + space = AffineSpace3fa::scale(scale)*space; + BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); + BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); + + space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z; + space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z; + space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z; + space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z; + + /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z; + b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/ + + b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z; + b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! 
Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent0(size_t i) const { + assert(i < N); + const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]); + const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]); + const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + public: + AffineSpace3vf space0; + //BBox3vf b0; // these are the unit bounds + BBox3vf b1; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h new file mode 100644 index 00000000000..5212821f3fe --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h @@ -0,0 +1,265 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN Quantized Node */ + template + struct __aligned(8) QuantizedBaseNode_t + { + typedef unsigned char T; + static const T MIN_QUAN = 0; + static const T MAX_QUAN = 255; + + /*! Clears the node. */ + __forceinline void clear() { + for (size_t i=0; i &lower, + const vfloat &upper, + T lower_quant[N], + T upper_quant[N], + float &start, + float &scale) + { + /* quantize bounds */ + const vbool m_valid = lower != vfloat(pos_inf); + const float minF = reduce_min(lower); + const float maxF = reduce_max(upper); + float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); + float decode_scale = diff / float(MAX_QUAN); + if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero + assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); + const float encode_scale = diff > 0 ? 
(float(MAX_QUAN) / diff) : 0.0f; + vint ilower = max(vint(floor((lower - vfloat(minF))*vfloat(encode_scale))),MIN_QUAN); + vint iupper = min(vint(ceil ((upper - vfloat(minF))*vfloat(encode_scale))),MAX_QUAN); + + /* lower/upper correction */ + vbool m_lower_correction = (madd(vfloat(ilower),decode_scale,minF)) > lower; + vbool m_upper_correction = (madd(vfloat(iupper),decode_scale,minF)) < upper; + ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); + iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); + + /* disable invalid lanes */ + ilower = select(m_valid,ilower,MAX_QUAN); + iupper = select(m_valid,iupper,MIN_QUAN); + + /* store as uchar to memory */ + vint::store(lower_quant,ilower); + vint::store(upper_quant,iupper); + start = minF; + scale = decode_scale; + +#if defined(DEBUG) + vfloat extract_lower( vint::loadu(lower_quant) ); + vfloat extract_upper( vint::loadu(upper_quant) ); + vfloat final_extract_lower = madd(extract_lower,decode_scale,minF); + vfloat final_extract_upper = madd(extract_upper,decode_scale,minF); + assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); + assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); +#endif + } + + __forceinline void init_dim(AABBNode_t,N>& node) + { + init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); + init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); + init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); + } + + __forceinline vbool validMask() const { return vint::loadu(lower_x) <= vint::loadu(upper_x); } + +#if defined(__AVX512F__) // KNL + __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } +#endif + __forceinline vfloat dequantizeLowerX() const { return madd(vfloat(vint::loadu(lower_x)),scale.x,vfloat(start.x)); } + + __forceinline vfloat dequantizeUpperX() const { return 
madd(vfloat(vint::loadu(upper_x)),scale.x,vfloat(start.x)); } + + __forceinline vfloat dequantizeLowerY() const { return madd(vfloat(vint::loadu(lower_y)),scale.y,vfloat(start.y)); } + + __forceinline vfloat dequantizeUpperY() const { return madd(vfloat(vint::loadu(upper_y)),scale.y,vfloat(start.y)); } + + __forceinline vfloat dequantizeLowerZ() const { return madd(vfloat(vint::loadu(lower_z)),scale.z,vfloat(start.z)); } + + __forceinline vfloat dequantizeUpperZ() const { return madd(vfloat(vint::loadu(upper_z)),scale.z,vfloat(start.z)); } + + template + __forceinline vfloat dequantize(const size_t offset) const { return vfloat(vint::loadu(all_planes+offset)); } + +#if defined(__AVX512F__) + __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } + __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } + __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } +#endif + + union { + struct { + T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children + T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children + T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children + T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children + T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children + T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children + }; + T all_planes[6*N]; + }; + + Vec3f start; + Vec3f scale; + + friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) + { + o << "QuantizedBaseNode { " << embree_endl; + o << " start " << n.start << embree_endl; + o << " scale " << n.scale << embree_endl; 
+ o << " lower_x " << vuint::loadu(n.lower_x) << embree_endl; + o << " upper_x " << vuint::loadu(n.upper_x) << embree_endl; + o << " lower_y " << vuint::loadu(n.lower_y) << embree_endl; + o << " upper_y " << vuint::loadu(n.upper_y) << embree_endl; + o << " lower_z " << vuint::loadu(n.lower_z) << embree_endl; + o << " upper_z " << vuint::loadu(n.upper_z) << embree_endl; + o << "}" << embree_endl; + return o; + } + + }; + + template + struct __aligned(8) QuantizedNode_t : public BaseNode_t, QuantizedBaseNode_t + { + using BaseNode_t::children; + using QuantizedBaseNode_t::lower_x; + using QuantizedBaseNode_t::upper_x; + using QuantizedBaseNode_t::lower_y; + using QuantizedBaseNode_t::upper_y; + using QuantizedBaseNode_t::lower_z; + using QuantizedBaseNode_t::upper_z; + using QuantizedBaseNode_t::start; + using QuantizedBaseNode_t::scale; + using QuantizedBaseNode_t::init_dim; + + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + struct Create2 + { + template + __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const + { + __aligned(64) AABBNode_t node; + node.clear(); + for (size_t i=0; iinit(node); + + return (size_t)qnode | NodeRef::tyQuantizedNode; + } + }; + + struct Set2 + { + template + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + QuantizedNode_t* node = ref.quantizedNode(); + for (size_t i=0; isetRef(i,children[i]); + return ref; + } + }; + + __forceinline void init(AABBNode_t& node) + { + for (size_t i=0;i + struct __aligned(8) QuantizedBaseNodeMB_t + { + QuantizedBaseNode_t node0; + QuantizedBaseNode_t node1; + + /*! Clears the node. */ + __forceinline void clear() { + node0.clear(); + node1.clear(); + } + + /*! Returns bounds of specified child. 
*/ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + BBox3fa bounds0 = node0.bounds(i); + BBox3fa bounds1 = node1.bounds(i); + bounds0.extend(bounds1); + return bounds0; + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + __forceinline vbool validMask() const { return node0.validMask(); } + + template + __forceinline vfloat dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } + template + __forceinline vfloat dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } + template + __forceinline vfloat dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } + template + __forceinline vfloat dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } + template + __forceinline vfloat dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } + template + __forceinline vfloat dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } + + + template + __forceinline vfloat dequantizeLowerX(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerX()[i]),vfloat(node1.dequantizeLowerX()[i]),t); } + template + __forceinline vfloat dequantizeUpperX(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperX()[i]),vfloat(node1.dequantizeUpperX()[i]),t); } + template + __forceinline vfloat dequantizeLowerY(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerY()[i]),vfloat(node1.dequantizeLowerY()[i]),t); } + template + __forceinline vfloat dequantizeUpperY(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperY()[i]),vfloat(node1.dequantizeUpperY()[i]),t); } + template + __forceinline vfloat 
dequantizeLowerZ(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeLowerZ()[i]),vfloat(node1.dequantizeLowerZ()[i]),t); } + template + __forceinline vfloat dequantizeUpperZ(const size_t i, const vfloat &t) const { return lerp(vfloat(node0.dequantizeUpperZ()[i]),vfloat(node1.dequantizeUpperZ()[i]),t); } + + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h new file mode 100644 index 00000000000..0f6d4dac7e7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h @@ -0,0 +1,242 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/alloc.h" +#include "../common/accel.h" +#include "../common/device.h" +#include "../common/scene.h" +#include "../geometry/primitive.h" +#include "../common/ray.h" + +namespace embree +{ + /* BVH node reference with bounds */ + template + struct BVHNodeRecord + { + __forceinline BVHNodeRecord() {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {} + + NodeRef ref; + BBox3fx bounds; + }; + + template + struct BVHNodeRecordMB + { + __forceinline BVHNodeRecordMB() {} + __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {} + + NodeRef ref; + LBBox3fa lbounds; + }; + + template + struct BVHNodeRecordMB4D + { + __forceinline BVHNodeRecordMB4D() {} + __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {} + + NodeRef ref; + LBBox3fa lbounds; + BBox1f dt; + }; + + template struct BaseNode_t; + template struct AABBNode_t; + template struct AABBNodeMB_t; + template struct AABBNodeMB4D_t; + template struct OBBNode_t; + template struct OBBNodeMB_t; + template struct 
QuantizedNode_t; + template struct QuantizedNodeMB_t; + + /*! Pointer that points to a node or a list of primitives */ + template + struct NodeRefPtr + { + //template friend class BVHN; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! highest address bit is used as barrier for some algorithms */ + static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1)); + + /*! Masks the bits that store the number of items per leaf. */ + static const size_t align_mask = byteAlignment-1; + static const size_t items_mask = byteAlignment-1; + + /*! different supported node types */ + static const size_t tyAABBNode = 0; + static const size_t tyAABBNodeMB = 1; + static const size_t tyAABBNodeMB4D = 6; + static const size_t tyOBBNode = 2; + static const size_t tyOBBNodeMB = 3; + static const size_t tyQuantizedNode = 5; + static const size_t tyLeaf = 8; + + /*! Empty node */ + static const size_t emptyNode = tyLeaf; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0); + static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1); + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = items_mask-tyLeaf; + + /*! Default constructor */ + __forceinline NodeRefPtr () {} + + /*! Construction from integer */ + __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {} + + /*! Cast to size_t */ + __forceinline operator size_t() const { return ptr; } + + /*! Sets the barrier bit. */ + __forceinline void setBarrier() { +#if defined(__X86_64__) || defined(__aarch64__) + assert(!isBarrier()); + ptr |= barrier_mask; +#else + assert(false); +#endif + } + + /*! Clears the barrier bit. */ + __forceinline void clearBarrier() { +#if defined(__X86_64__) || defined(__aarch64__) + ptr &= ~barrier_mask; +#else + assert(false); +#endif + } + + /*! 
Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */ + __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; } + + /*! checks if this is a leaf */ + __forceinline size_t isLeaf() const { return ptr & tyLeaf; } + + /*! returns node type */ + __forceinline int type() const { return ptr & (size_t)align_mask; } + + /*! checks if this is a node */ + __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; } + + /*! checks if this is a motion blur node */ + __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; } + + /*! checks if this is a 4D motion blur node */ + __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; } + + /*! checks if this is a node with unaligned bounding boxes */ + __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; } + + /*! checks if this is a motion blur node with unaligned bounding boxes */ + __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; } + + /*! checks if this is a quantized node */ + __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; } + + /*! Encodes a node */ + static __forceinline NodeRefPtr encodeNode(AABBNode_t* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB4D); + } + + /*! Encodes an unaligned node */ + static __forceinline NodeRefPtr encodeNode(OBBNode_t* node) { + return NodeRefPtr((size_t) node | tyOBBNode); + } + + /*! 
Encodes an unaligned motion blur node */ + static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t* node) { + return NodeRefPtr((size_t) node | tyOBBNodeMB); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) { + assert(!((size_t)tri & align_mask)); + assert(num <= maxLeafBlocks); + return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks))); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) { + assert(!((size_t)ptr & align_mask)); + return NodeRefPtr((size_t)ptr | (tyLeaf+ty)); + } + + /*! returns base node pointer */ + __forceinline BaseNode_t* baseNode() + { + assert(!isLeaf()); + return (BaseNode_t*)(ptr & ~(size_t)align_mask); + } + __forceinline const BaseNode_t* baseNode() const + { + assert(!isLeaf()); + return (const BaseNode_t*)(ptr & ~(size_t)align_mask); + } + + /*! returns node pointer */ + __forceinline AABBNode_t* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t*)ptr; } + __forceinline const AABBNode_t* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t*)ptr; } + + /*! returns motion blur node pointer */ + __forceinline AABBNodeMB_t* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB_t* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t*)(ptr & ~(size_t)align_mask); } + + /*! returns 4D motion blur node pointer */ + __forceinline AABBNodeMB4D_t* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB4D_t* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t*)(ptr & ~(size_t)align_mask); } + + /*! 
returns unaligned node pointer */ + __forceinline OBBNode_t* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNode_t* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned motion blur node pointer */ + __forceinline OBBNodeMB_t* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNodeMB_t* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t*)(ptr & ~(size_t)align_mask); } + + /*! returns quantized node pointer */ + __forceinline QuantizedNode_t* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t*)(ptr & ~(size_t)align_mask ); } + __forceinline const QuantizedNode_t* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t*)(ptr & ~(size_t)align_mask ); } + + /*! returns leaf pointer */ + __forceinline char* leaf(size_t& num) const { + assert(isLeaf()); + num = (ptr & (size_t)items_mask)-tyLeaf; + return (char*)(ptr & ~(size_t)align_mask); + } + + /*! clear all bit flags */ + __forceinline void clearFlags() { + ptr &= ~(size_t)align_mask; + } + + /*! 
returns the wideness */ + __forceinline size_t getN() const { return N; } + + public: + size_t ptr; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp new file mode 100644 index 00000000000..a273c21e8ba --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp @@ -0,0 +1,247 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_refit.h" +#include "bvh_statistics.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + namespace isa + { + static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; + + template + __forceinline bool compare(const typename BVHN::NodeRef* a, const typename BVHN::NodeRef* b) + { + size_t sa = *(size_t*)&a->node()->lower_x; + size_t sb = *(size_t*)&b->node()->lower_x; + return sa < sb; + } + + template + BVHNRefitter::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds) + : bvh(bvh), leafBounds(leafBounds), numSubTrees(0) + { + } + + template + void BVHNRefitter::refit() + { + if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) { + bvh->bounds = LBBox3fa(recurse_bottom(bvh->root)); + } + else + { + BBox3fa subTreeBounds[MAX_NUM_SUB_TREES]; + numSubTrees = 0; + gather_subtree_refs(bvh->root,numSubTrees,0); + if (numSubTrees) + parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range& r) { + for (size_t i=r.begin(); ibounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0)); + } + } + + template + void BVHNRefitter::gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + subTrees[subtrees++] = ref; + return; + } + + if (ref.isAABBNode()) + { + AABBNode* node 
= ref.getAABBNode(); + for (size_t i=0; ichild(i); + if (unlikely(child == BVH::emptyNode)) continue; + gather_subtree_refs(child,subtrees,depth+1); + } + } + } + + template + BBox3fa BVHNRefitter::refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + assert(subTrees[subtrees] == ref); + return subTreeBounds[subtrees++]; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + BBox3fa bounds[N]; + + for (size_t i=0; ichild(i); + + if (unlikely(child == BVH::emptyNode)) + bounds[i] = BBox3fa(empty); + else + bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); + } + + BBox3vf boundsT = transpose(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge(bounds); + } + else + return leafBounds.leafBounds(ref); + } + + // ========================================================= + // ========================================================= + // ========================================================= + + + template + BBox3fa BVHNRefitter::recurse_bottom(NodeRef& ref) + { + /* this is a leaf node */ + if (unlikely(ref.isLeaf())) + return leafBounds.leafBounds(ref); + + /* recurse if this is an internal node */ + AABBNode* node = ref.getAABBNode(); + + /* enable exclusive prefetch for >= AVX platforms */ +#if defined(__AVX__) + BVH::prefetchW(ref); +#endif + BBox3fa bounds[N]; + + for (size_t i=0; ichild(i) == BVH::emptyNode)) + { + bounds[i] = BBox3fa(empty); + } + else + bounds[i] = recurse_bottom(node->child(i)); + + /* AOS to SOA transform */ + BBox3vf boundsT = transpose(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + 
node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge(bounds); + } + + template + BVHNRefitT::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) + : bvh(bvh), builder(builder), refitter(new BVHNRefitter(bvh,*(typename BVHNRefitter::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} + + template + void BVHNRefitT::clear() + { + if (builder) + builder->clear(); + } + + template + void BVHNRefitT::build() + { + if (mesh->topologyChanged(topologyVersion)) { + topologyVersion = mesh->getTopologyVersion(); + builder->build(); + } + else + refitter->refit(); + } + + template class BVHNRefitter<4>; +#if defined(__AVX__) + template class BVHNRefitter<8>; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + 
Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif + +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new 
BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } +#endif +#endif + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h new file mode 100644 index 00000000000..4aa9bdd7cc8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h @@ -0,0 +1,95 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" + +namespace embree +{ + namespace isa + { + template + class BVHNRefitter + { + public: + + /*! 
Type shortcuts */ + typedef BVHN BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + struct LeafBoundsInterface { + virtual const BBox3fa leafBounds(NodeRef& ref) const = 0; + }; + + public: + + /*! Constructor. */ + BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds); + + /*! refits the BVH */ + void refit(); + + private: + /* single-threaded subtree extraction based on BVH depth */ + void gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth = 0); + + /* single-threaded top-level refit */ + BBox3fa refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth = 0); + + /* single-threaded subtree refit */ + BBox3fa recurse_bottom(NodeRef& ref); + + public: + BVH* bvh; //!< BVH to refit + const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves + + static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3; + static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH + size_t numSubTrees; + NodeRef subTrees[MAX_NUM_SUB_TREES]; + }; + + template + class BVHNRefitT : public Builder, public BVHNRefitter::LeafBoundsInterface + { + public: + + /*! 
Type shortcuts */ + typedef BVHN BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + public: + BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode); + + virtual void build(); + + virtual void clear(); + + virtual const BBox3fa leafBounds (NodeRef& ref) const + { + size_t num; char* prim = ref.leaf(num); + if (unlikely(ref == BVH::emptyNode)) return empty; + + BBox3fa bounds = empty; + for (size_t i=0; i builder; + std::unique_ptr> refitter; + Mesh* mesh; + unsigned int topologyVersion; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp new file mode 100644 index 00000000000..2bb431bf0e4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp @@ -0,0 +1,127 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_rotate.h" + +namespace embree +{ + namespace isa + { + /*! Computes half surface area of box. */ + __forceinline float halfArea3f(const BBox& box) { + const vfloat4 d = box.size(); + const vfloat4 a = d*shuffle<1,2,0,3>(d); + return a[0]+a[1]+a[2]; + } + + size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) + { + /*! nothing to rotate if we reached a leaf node. */ + if (parentRef.isBarrier()) return 0; + if (parentRef.isLeaf()) return 0; + AABBNode* parent = parentRef.getAABBNode(); + + /*! rotate all children first */ + vint4 cdepth; + for (size_t c=0; c<4; c++) + cdepth[c] = (int)rotate(parent->child(c),depth+1); + + /* compute current areas of all children */ + vfloat4 sizeX = parent->upper_x-parent->lower_x; + vfloat4 sizeY = parent->upper_y-parent->lower_y; + vfloat4 sizeZ = parent->upper_z-parent->lower_z; + vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); + + /*! get node bounds */ + BBox child1_0,child1_1,child1_2,child1_3; + parent->bounds(child1_0,child1_1,child1_2,child1_3); + + /*! Find best rotation. 
We pick a first child (child1) and a sub-child + (child2child) of a different second child (child2), and swap child1 + and child2child. We perform the best such swap. */ + float bestArea = 0; + size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; + for (size_t c2=0; c2<4; c2++) + { + /*! ignore leaf nodes as we cannot descent into them */ + if (parent->child(c2).isBarrier()) continue; + if (parent->child(c2).isLeaf()) continue; + AABBNode* child2 = parent->child(c2).getAABBNode(); + + /*! transpose child bounds */ + BBox child2c0,child2c1,child2c2,child2c3; + child2->bounds(child2c0,child2c1,child2c2,child2c3); + + /*! put child1_0 at each child2 position */ + float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)); + float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)); + float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)); + float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)); + vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); + vfloat4 min0 = vreduce_min(cost0); + int pos0 = (int)bsf(movemask(min0 == cost0)); + + /*! put child1_1 at each child2 position */ + float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)); + float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)); + float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)); + float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)); + vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); + vfloat4 min1 = vreduce_min(cost1); + int pos1 = (int)bsf(movemask(min1 == cost1)); + + /*! 
put child1_2 at each child2 position */ + float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)); + float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)); + float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)); + float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)); + vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); + vfloat4 min2 = vreduce_min(cost2); + int pos2 = (int)bsf(movemask(min2 == cost2)); + + /*! put child1_3 at each child2 position */ + float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)); + float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)); + float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)); + float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)); + vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); + vfloat4 min3 = vreduce_min(cost3); + int pos3 = (int)bsf(movemask(min3 == cost3)); + + /*! find best other child */ + vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]); + int pos[4] = { pos0,pos1,pos2,pos3 }; + const size_t mbd = BVH4::maxBuildDepth; + vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints + valid &= vint4(int(c2)) != vint4(step); + if (none(valid)) continue; + size_t c1 = select_min(valid,area0123); + float area = area0123[c1]; + if (c1 == c2) continue; // can happen if bounds are NANs + + /*! accept a swap when it reduces cost and is not swapping a node with itself */ + if (area < bestArea) { + bestArea = area; + bestChild1 = c1; + bestChild2 = c2; + bestChild2Child = pos[c1]; + } + } + + /*! if we did not find a swap that improves the SAH then do nothing */ + if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth); + + /*! 
perform the best found tree rotation */ + AABBNode* child2 = parent->child(bestChild2).getAABBNode(); + AABBNode::swap(parent,bestChild1,child2,bestChild2Child); + parent->setBounds(bestChild2,child2->bounds()); + AABBNode::compact(parent); + AABBNode::compact(child2); + + /*! This returned depth is conservative as the child that was + * pulled up in the tree could have been on the critical path. */ + cdepth[bestChild1]++; // bestChild1 was pushed down one level + return 1+reduce_max(cdepth); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h new file mode 100644 index 00000000000..009bef339ee --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + template + class BVHNRotate + { + typedef typename BVHN::NodeRef NodeRef; + + public: + static const bool enabled = false; + + static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; } + static __forceinline void restructure(NodeRef ref, size_t depth = 1) {} + }; + + /* BVH4 tree rotations */ + template<> + class BVHNRotate<4> + { + typedef BVH4::AABBNode AABBNode; + typedef BVH4::NodeRef NodeRef; + + public: + static const bool enabled = true; + + static size_t rotate(NodeRef parentRef, size_t depth = 1); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp new file mode 100644 index 00000000000..aa560350269 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp @@ -0,0 +1,168 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_statistics.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + template + BVHNStatistics::BVHNStatistics (BVH* 
bvh) : bvh(bvh) + { + double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); + stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f)); + } + + template + std::string BVHNStatistics::str() + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl; + size_t totalBytes = stat.bytes(bvh); + double totalSAH = stat.sah(bvh); + stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), "; + stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl; + if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " leaves : " << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl; 
+ return stream.str(); + } + + template + typename BVHNStatistics::Statistics BVHNStatistics::statistics(NodeRef node, const double A, const BBox1f t0t1) + { + Statistics s; + assert(t0t1.size() > 0.0f); + double dt = max(0.0f,t0t1.size()); + if (node.isAABBNode()) + { + AABBNode* n = node.getAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extend(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodes.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodes.numNodes++; + s.statAABBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNode()) + { + OBBNode* n = node.ungetAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodes.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodes.numNodes++; + s.statOBBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB()) + { + AABBNodeMB* n = node.getAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1)); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB.numNodes++; + s.statAABBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB4D()) + { + AABBNodeMB4D* n = node.getAABBNodeMB4D(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); + assert(!t0t1i.empty()); + const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); + Statistics s = statistics(n->child(i),Ai,t0t1i); + 
s.statAABBNodesMB4D.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB4D.numNodes++; + s.statAABBNodesMB4D.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNodeMB()) + { + OBBNodeMB* n = node.ungetAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent0(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodesMB.numNodes++; + s.statOBBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isQuantizedNode()) + { + QuantizedNode* n = node.quantizedNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statQuantizedNodes.numChildren++; + return s; + }, Statistics::add); + s.statQuantizedNodes.numNodes++; + s.statQuantizedNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isLeaf()) + { + size_t num; const char* tri = node.leaf(num); + if (num) + { + for (size_t i=0; iprimTy->getBytes(tri); + s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri); + s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri); + s.statLeaf.numBytes += bytes; + tri+=bytes; + } + s.statLeaf.numLeaves++; + s.statLeaf.numPrimBlocks += num; + s.statLeaf.leafSAH += dt*A*num; + if (num-1 < Statistics::LeafStat::NHIST) { + s.statLeaf.numPrimBlocksHistogram[num-1]++; + } + } + } + else { + // -- GODOT start -- + // throw std::runtime_error("not supported node type in bvh_statistics"); + abort(); + // -- GODOT end -- + } + return s; + } + +#if defined(__AVX__) + template class BVHNStatistics<8>; +#endif + +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) + template class BVHNStatistics<4>; +#endif +} diff --git 
a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h new file mode 100644 index 00000000000..73dfc6fbcc0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h @@ -0,0 +1,285 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include + +namespace embree +{ + template + class BVHNStatistics + { + typedef BVHN BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::OBBNode OBBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + typedef typename BVH::OBBNodeMB OBBNodeMB; + typedef typename BVH::QuantizedNode QuantizedNode; + + typedef typename BVH::NodeRef NodeRef; + + struct Statistics + { + template + struct NodeStat + { + NodeStat ( double nodeSAH = 0, + size_t numNodes = 0, + size_t numChildren = 0) + : nodeSAH(nodeSAH), + numNodes(numNodes), + numChildren(numChildren) {} + + double sah(BVH* bvh) const { + return nodeSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes() const { + return numNodes*sizeof(Node); + } + + size_t size() const { + return numNodes; + } + + double fillRateNom () const { return double(numChildren); } + double fillRateDen () const { return double(numNodes*N); } + double fillRate () const { return fillRateNom()/fillRateDen(); } + + __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b) + { + return NodeStat(a.nodeSAH + b.nodeSAH, + a.numNodes+b.numNodes, + a.numChildren+b.numChildren); + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << 
bytes()/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives); + return stream.str(); + } + + public: + double nodeSAH; + size_t numNodes; + size_t numChildren; + }; + + struct LeafStat + { + static const int NHIST = 8; + + LeafStat ( double leafSAH = 0.0f, + size_t numLeaves = 0, + size_t numPrimsActive = 0, + size_t numPrimsTotal = 0, + size_t numPrimBlocks = 0, + size_t numBytes = 0) + : leafSAH(leafSAH), + numLeaves(numLeaves), + numPrimsActive(numPrimsActive), + numPrimsTotal(numPrimsTotal), + numPrimBlocks(numPrimBlocks), + numBytes(numBytes) + { + for (size_t i=0; igetLinearBounds().expectedHalfArea(); + } + + size_t bytes(BVH* bvh) const { + return numBytes; + } + + size_t size() const { + return numLeaves; + } + + double fillRateNom (BVH* bvh) const { return double(numPrimsActive); } + double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); } + double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); } + + __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b) + { + LeafStat stat(a.leafSAH + b.leafSAH, + a.numLeaves+b.numLeaves, + a.numPrimsActive+b.numPrimsActive, + a.numPrimsTotal+b.numPrimsTotal, + a.numPrimBlocks+b.numPrimBlocks, + a.numBytes+b.numBytes); + for (size_t i=0; i statAABBNodes = NodeStat(), + NodeStat statOBBNodes = NodeStat(), + NodeStat statAABBNodesMB = NodeStat(), + NodeStat statAABBNodesMB4D = NodeStat(), + NodeStat statOBBNodesMB = NodeStat(), + NodeStat statQuantizedNodes = NodeStat()) + + : depth(depth), + statLeaf(statLeaf), + statAABBNodes(statAABBNodes), + statOBBNodes(statOBBNodes), + statAABBNodesMB(statAABBNodesMB), + 
statAABBNodesMB4D(statAABBNodesMB4D), + statOBBNodesMB(statOBBNodesMB), + statQuantizedNodes(statQuantizedNodes) {} + + double sah(BVH* bvh) const + { + return statLeaf.sah(bvh) + + statAABBNodes.sah(bvh) + + statOBBNodes.sah(bvh) + + statAABBNodesMB.sah(bvh) + + statAABBNodesMB4D.sah(bvh) + + statOBBNodesMB.sah(bvh) + + statQuantizedNodes.sah(bvh); + } + + size_t bytes(BVH* bvh) const { + return statLeaf.bytes(bvh) + + statAABBNodes.bytes() + + statOBBNodes.bytes() + + statAABBNodesMB.bytes() + + statAABBNodesMB4D.bytes() + + statOBBNodesMB.bytes() + + statQuantizedNodes.bytes(); + } + + size_t size() const + { + return statLeaf.size() + + statAABBNodes.size() + + statOBBNodes.size() + + statAABBNodesMB.size() + + statAABBNodesMB4D.size() + + statOBBNodesMB.size() + + statQuantizedNodes.size(); + } + + double fillRate (BVH* bvh) const + { + double nom = statLeaf.fillRateNom(bvh) + + statAABBNodes.fillRateNom() + + statOBBNodes.fillRateNom() + + statAABBNodesMB.fillRateNom() + + statAABBNodesMB4D.fillRateNom() + + statOBBNodesMB.fillRateNom() + + statQuantizedNodes.fillRateNom(); + double den = statLeaf.fillRateDen(bvh) + + statAABBNodes.fillRateDen() + + statOBBNodes.fillRateDen() + + statAABBNodesMB.fillRateDen() + + statAABBNodesMB4D.fillRateDen() + + statOBBNodesMB.fillRateDen() + + statQuantizedNodes.fillRateDen(); + return nom/den; + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b ) + { + return Statistics(max(a.depth,b.depth), + a.statLeaf + b.statLeaf, + a.statAABBNodes + b.statAABBNodes, + a.statOBBNodes + b.statOBBNodes, + a.statAABBNodesMB + b.statAABBNodesMB, + a.statAABBNodesMB4D + b.statAABBNodesMB4D, + a.statOBBNodesMB + b.statOBBNodesMB, + a.statQuantizedNodes + b.statQuantizedNodes); + } + + static Statistics add ( const Statistics& a, const Statistics& b ) { + return a+b; + } + + public: + size_t depth; + LeafStat statLeaf; + NodeStat statAABBNodes; + NodeStat statOBBNodes; + NodeStat statAABBNodesMB; + NodeStat 
statAABBNodesMB4D; + NodeStat statOBBNodesMB; + NodeStat statQuantizedNodes; + }; + + public: + + /* Constructor gathers statistics. */ + BVHNStatistics (BVH* bvh); + + /*! Convert statistics into a string */ + std::string str(); + + double sah() const { + return stat.sah(bvh); + } + + size_t bytesUsed() const { + return stat.bytes(bvh); + } + + private: + Statistics statistics(NodeRef node, const double A, const BBox1f dt); + + private: + BVH* bvh; + Statistics stat; + }; + + typedef BVHNStatistics<4> BVH4Statistics; + typedef BVHNStatistics<8> BVH8Statistics; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h new file mode 100644 index 00000000000..7f17084b81d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h @@ -0,0 +1,676 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "node_intersector1.h" +#include "../common/stack_item.h" + +#define NEW_SORTING_CODE 1 + +namespace embree +{ + namespace isa + { + /*! BVH regular node traversal for single rays. */ + template + class BVHNNodeTraverser1Hit; + + /*! Helper functions for fast sorting using AVX512 instructions. 
*/ +#if defined(__AVX512ER__) + + /* KNL code path */ + __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) + { + const vfloat16 dist_shift = align_shift_right<15>(dist,dist); + const vllong8 ptr_shift = align_shift_right<7>(ptr,ptr); + const vbool16 m_geq = d >= dist; + const vbool16 m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + ptr = select(vboold8(m_geq),p,ptr); + dist = select(m_geq_shift,dist_shift,dist); + ptr = select(vboold8(m_geq_shift),ptr_shift,ptr); + } + + __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) + { + //dist = align_shift_right<15>(dist,d); + //ptr = align_shift_right<7>(ptr,p); + dist = align_shift_right<15>(dist,permute(d,vint16(zero))); + ptr = align_shift_right<7>(ptr,permute(p,vllong8(zero))); + } + + template + __forceinline void traverseClosestHitAVX512(NodeRef& cur, + size_t mask, + const vfloat& tNear, + StackItemT*& stackPtr, + StackItemT* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + vllong8 children( vllong::loadu((void*)node->children) ); + children = vllong8::compact((int)mask,children); + vfloat16 distance = tNear; + distance = vfloat16::compact((int)mask,distance,tNear); + + cur = toScalar(children); + BVHN::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vllong8 c0(children); + const vfloat16 d0(distance); + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + const vllong8 c1(children); + const vfloat16 d1(distance); + + cur = toScalar(children); + BVHN::prefetch(cur,types); + + /* a '<' keeps the order for equal distances, scenes like powerplant largely benefit from it */ + const vboolf16 m_dist = d0 < d1; + const vfloat16 dist_A0 = select(m_dist, d0, d1); + const vfloat16 dist_B0 = select(m_dist, d1, d0); + const vllong8 ptr_A0 = select(vboold8(m_dist), c0, 
c1); + const vllong8 ptr_B0 = select(vboold8(m_dist), c1, c0); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A0); + stackPtr[0].ptr = toScalar(ptr_B0); + *(float*)&stackPtr[0].dist = toScalar(dist_B0); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + const vllong8 c2(children); + const vfloat16 d2(distance); + + cur = toScalar(children); + BVHN::prefetch(cur,types); + + const vboolf16 m_dist1 = dist_A0 <= d2; + const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0); + const vllong8 ptr_A1 = select(vboold8(m_dist1), ptr_A0, c2); + const vllong8 ptr_tmp_B1 = select(vboold8(m_dist1), c2, ptr_A0); + + const vboolf16 m_dist2 = dist_B0 <= dist_tmp_B1; + const vfloat16 dist_B1 = select(m_dist2, dist_B0 , dist_tmp_B1); + const vfloat16 dist_C1 = select(m_dist2, dist_tmp_B1, dist_B0); + const vllong8 ptr_B1 = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1); + const vllong8 ptr_C1 = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A1); + stackPtr[0].ptr = toScalar(ptr_C1); + *(float*)&stackPtr[0].dist = toScalar(dist_C1); + stackPtr[1].ptr = toScalar(ptr_B1); + *(float*)&stackPtr[1].dist = toScalar(dist_B1); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vfloat16 dist_A1 = select(m_dist1, dist_A0, d2); + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + const vllong8 c3(children); + const vfloat16 d3(distance); + + cur = toScalar(children); + BVHN::prefetch(cur,types); + + const vboolf16 m_dist3 = dist_A1 <= d3; + const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1); + const vllong8 ptr_A2 = select(vboold8(m_dist3), ptr_A1, c3); + const vllong8 ptr_tmp_B2 = select(vboold8(m_dist3), c3, ptr_A1); + + const vboolf16 m_dist4 = dist_B1 <= dist_tmp_B2; + const vfloat16 dist_B2 = 
select(m_dist4, dist_B1 , dist_tmp_B2); + const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1); + const vllong8 ptr_B2 = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2); + const vllong8 ptr_tmp_C2 = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1); + + const vboolf16 m_dist5 = dist_C1 <= dist_tmp_C2; + const vfloat16 dist_C2 = select(m_dist5, dist_C1 , dist_tmp_C2); + const vfloat16 dist_D2 = select(m_dist5, dist_tmp_C2, dist_C1); + const vllong8 ptr_C2 = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2); + const vllong8 ptr_D2 = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A2); + stackPtr[0].ptr = toScalar(ptr_D2); + *(float*)&stackPtr[0].dist = toScalar(dist_D2); + stackPtr[1].ptr = toScalar(ptr_C2); + *(float*)&stackPtr[1].dist = toScalar(dist_C2); + stackPtr[2].ptr = toScalar(ptr_B2); + *(float*)&stackPtr[2].dist = toScalar(dist_B2); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + const size_t hits = 4 + popcnt(mask); + const vfloat16 dist_A2 = select(m_dist3, dist_A1, d3); + vfloat16 dist(neg_inf); + vllong8 ptr(zero); + + + isort_quick_update(dist,ptr,dist_A2,ptr_A2); + isort_quick_update(dist,ptr,dist_B2,ptr_B2); + isort_quick_update(dist,ptr,dist_C2,ptr_C2); + isort_quick_update(dist,ptr,dist_D2,ptr_D2); + + do { + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + cur = toScalar(children); + BVHN::prefetch(cur,types); + + const vfloat16 new_dist(permute(distance,vint16(zero))); + const vllong8 new_ptr(permute(children,vllong8(zero))); + + mask &= mask-1; + isort_update(dist,ptr,new_dist,new_ptr); + + } while(mask); + + const vboold8 m_stack_ptr(0x55); // 10101010 (lsb -> msb) + const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb) + + /* extract current noderef */ + cur = toScalar(permute(ptr,vllong8(hits-1))); + /* rearrange pointers to beginning of 16 
bytes block */ + vllong8 stackElementA0; + stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0); + /* put distances in between */ + vuint16 stackElementA1((__m512i)stackElementA0); + stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1); + /* write out first 4 x 16 bytes block to stack */ + vuint16::storeu(stackPtr,stackElementA1); + /* get upper half of dist and ptr */ + dist = align_shift_right<4>(dist,dist); + ptr = align_shift_right<4>(ptr,ptr); + /* assemble and write out second block */ + vllong8 stackElementB0; + stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0); + vuint16 stackElementB1((__m512i)stackElementB0); + stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1); + vuint16::storeu(stackPtr + 4,stackElementB1); + /* increase stack pointer */ + stackPtr += hits-1; + } +#endif + +#if defined(__AVX512VL__) // SKX + + template + __forceinline void isort_update(vint &dist, const vint &d) + { + const vint dist_shift = align_shift_right(dist,dist); + const vboolf m_geq = d >= dist; + const vboolf m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + dist = select(m_geq_shift,dist_shift,dist); + } + + template + __forceinline void isort_quick_update(vint &dist, const vint &d) { + dist = align_shift_right(dist,permute(d,vint(zero))); + } + + __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { + return toScalar(permutex2var((__m256i)index,n0,n1)); + } + + __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { + return toScalar(permute(n,index)); + } + +#endif + + /* Specialization for BVH4. */ + template + class BVHNNodeTraverser1Hit<4, Nx, types> + { + typedef BVH4 BVH; + typedef BVH4::NodeRef NodeRef; + typedef BVH4::BaseNode BaseNode; + + + public: + /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). 
*/ + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat& tNear, + StackItemT*& stackPtr, + StackItemT* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512ER__) + traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#else + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } + +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + StackItemT::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; +#else + /*! Here starts the slow path for 3 or 4 hit children. 
We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + + /* Specialization for BVH8. 
*/ + template + class BVHNNodeTraverser1Hit<8, Nx, types> + { + typedef BVH8 BVH; + typedef BVH8::NodeRef NodeRef; + typedef BVH8::BaseNode BaseNode; + +#if defined(__AVX512VL__) + template + static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT*& stackPtr, + StackItemT* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); + const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); + vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); + distance_i = vint8::compact((int)mask,distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vint8 d0(distance_i); + const vint8 d1(shuffle<1>(distance_i)); + cur = permuteExtract(d1,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A0 = min(d0, d1); + const vint8 dist_B0 = max(d0, d1); + assert(dist_A0[0] < dist_B0[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A0,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + const vint8 d2(shuffle<2>(distance_i)); + cur = permuteExtract(d2,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A1 = min(dist_A0,d2); + const vint8 dist_tmp_B1 = max(dist_A0,d2); + const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); + const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); + assert(dist_A1[0] < dist_B1[0]); + assert(dist_B1[0] < dist_C1[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A1,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); + stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); + *(float*)&stackPtr[1].dist = 
permuteExtract(dist_B1,tNear); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vint8 d3(shuffle<3>(distance_i)); + cur = permuteExtract(d3,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A2 = min(dist_A1,d3); + const vint8 dist_tmp_B2 = max(dist_A1,d3); + const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); + const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); + const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); + const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); + assert(dist_A2[0] < dist_B2[0]); + assert(dist_B2[0] < dist_C2[0]); + assert(dist_C2[0] < dist_D2[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A2,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); + stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); + stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); + *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + distance_i = align_shift_right<3>(distance_i,distance_i); + const size_t hits = 4 + popcnt(mask); + vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert + + isort_quick_update(dist,dist_A2); + isort_quick_update(dist,dist_B2); + isort_quick_update(dist,dist_C2); + isort_quick_update(dist,dist_D2); + + do { + + distance_i = align_shift_right<1>(distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + const vint8 new_dist(permute(distance_i,vint8(zero))); + mask &= mask-1; + isort_update(dist,new_dist); + + } while(mask); + + for (size_t i=0; i<7; i++) + assert(dist[i+0]>=dist[i+1]); + + for (size_t i=0;iptr = permuteExtract(dist,n0,n1); + *(float*)&stackPtr->dist = permuteExtract(dist,tNear); + dist = align_shift_right<1>(dist,dist); + stackPtr++; + } + cur = 
permuteExtract(dist,n0,n1); + } +#endif + + public: + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat& tNear, + StackItemT*& stackPtr, + StackItemT* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512ER__) + traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#elif defined(__AVX512VL__) + traverseClosestHitAVX512VL8(cur,mask,tNear,stackPtr,stackEnd); +#else + + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + if (likely(mask == 0)) { + StackItemT::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; 
*(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; + return; + } + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; + /*! fallback case if more than 4 children are hit */ + StackItemT* stackFirst = stackPtr; + stackPtr+=4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; + const vint4 s((size_t)c,(size_t)d); + *(vint4*)stackPtr++ = s; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + /*! 
fallback case if more than 4 children are hit */ + StackItemT* stackFirst = stackPtr-4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h new file mode 100644 index 00000000000..9c603babf0a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h @@ -0,0 +1,154 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" + +namespace embree +{ + namespace isa + { + template + class BVHNNodeTraverserStreamHitCoherent + { + typedef BVHN BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + + public: + template + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t& m_trav_active, + const vbool& vmask, + const vfloat& tNear, 
+ const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + const size_t r0 = bscf(mask); + assert(r0 < 8); + cur = node->child(r0); + BVHN::prefetch(cur,types); + m_trav_active = tMask[r0]; + assert(cur != BVH::emptyNode); + if (unlikely(mask == 0)) return; + + const unsigned int* const tNear_i = (unsigned int*)&tNear; + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + unsigned int d0 = tNear_i[r0]; + const size_t r1 = bscf(mask); + assert(r1 < 8); + NodeRef c1 = node->child(r1); + BVHN::prefetch(c1,types); + unsigned int d1 = tNear_i[r1]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + if (d0 < d1) { + assert(tNear[r1] >= 0.0f); + stackPtr->mask = tMask[r1]; + stackPtr->parent = parent; + stackPtr->child = c1; + stackPtr++; + cur = c0; + m_trav_active = tMask[r0]; + return; + } + else { + assert(tNear[r0] >= 0.0f); + stackPtr->mask = tMask[r0]; + stackPtr->parent = parent; + stackPtr->child = c0; + stackPtr++; + cur = c1; + m_trav_active = tMask[r1]; + return; + } + } + + /*! 
slow path for more than two hits */ + size_t hits = movemask(vmask); + const vint dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint(step), 0); + #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + const vint tmp = extractN(dist_i); + const vint dist_i_sorted = usort_descending(tmp); + #else + const vint dist_i_sorted = usort_descending(dist_i); + #endif + const vint sorted_index = dist_i_sorted & 7; + + size_t i = 0; + for (;;) + { + const unsigned int index = sorted_index[i]; + assert(index < 8); + cur = node->child(index); + m_trav_active = tMask[index]; + assert(m_trav_active); + BVHN::prefetch(cur,types); + bscf(hits); + if (unlikely(hits==0)) break; + i++; + assert(cur != BVH::emptyNode); + assert(tNear[index] >= 0.0f); + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + + template + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t& m_trav_active, + const vbool& vmask, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! 
one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVHN::prefetch(cur,types); + m_trav_active = tMask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); + BVHN::prefetch(cur,types); + m_trav_active = tMask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h new file mode 100644 index 00000000000..a978c0c4590 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h @@ -0,0 +1,31 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + struct NearFarPrecalculations + { + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + + __forceinline NearFarPrecalculations() {} + + __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N) + { + const size_t size = sizeof(float)*N; + nearX = (dir.x < 0.0f) ? 1*size : 0*size; + nearY = (dir.y < 0.0f) ? 3*size : 2*size; + nearZ = (dir.z < 0.0f) ? 
5*size : 4*size; + farX = nearX ^ size; + farY = nearY ^ size; + farZ = nearZ ^ size; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h new file mode 100644 index 00000000000..aa0d4ba4d74 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h @@ -0,0 +1,1788 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +#if defined(__AVX2__) +#define __FMA_X4__ +#endif + +#if defined(__aarch64__) +#define __FMA_X4__ +#endif + + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template + struct TravRayBase; + + /* Base (without tnear and tfar) */ + template + struct TravRayBase + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const Vec3fa ray_rdir = rcp_safe(ray_dir); + org = Vec3vf(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); + rdir = Vec3vf(ray_rdir.x,ray_rdir.y,ray_rdir.z); +#if defined(__FMA_X4__) + const Vec3fa ray_org_rdir = ray_org*ray_rdir; +#if !defined(__aarch64__) + org_rdir = Vec3vf(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#else + //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd + //x86 will use msub + neg_org_rdir = Vec3vf(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); +#endif +#endif + nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); + nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); + nearZ = ray_rdir.z >= 0.0f ? 
4*sizeof(vfloat) : 5*sizeof(vfloat); + farX = nearX ^ sizeof(vfloat); + farY = nearY ^ sizeof(vfloat); + farZ = nearZ ^ sizeof(vfloat); + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + + } + + template + __forceinline TravRayBase(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, + const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, + size_t flip = sizeof(vfloat)) + { + org = Vec3vf(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir = Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + org_rdir = org*rdir; +#else + neg_org_rdir = -(org*rdir); +#endif +#endif + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf org, dir, rdir; +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + Vec3vf org_rdir; +#else + //aarch64 version are keeping negation of the org_rdir and use madd + //x86 uses msub + Vec3vf neg_org_rdir; +#endif +#endif +#if defined(__AVX512ER__) // KNL+ + vint16 permX, permY, permZ; +#endif + + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Base (without tnear and tfar) */ + template + struct TravRayBase + { + __forceinline TravRayBase() {} + + 
__forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const float round_down = 1.0f-3.0f*float(ulp); + const float round_up = 1.0f+3.0f*float(ulp); + const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); + const Vec3fa ray_rdir_near = round_down*ray_rdir; + const Vec3fa ray_rdir_far = round_up *ray_rdir; + org = Vec3vf(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf(ray_dir.x,ray_dir.y,ray_dir.z); + rdir_near = Vec3vf(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); + rdir_far = Vec3vf(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); + nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat) : 1*sizeof(vfloat); + nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat) : 3*sizeof(vfloat); + nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat) : 5*sizeof(vfloat); + farX = nearX ^ sizeof(vfloat); + farY = nearY ^ sizeof(vfloat); + farZ = nearZ ^ sizeof(vfloat); + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + template + __forceinline TravRayBase(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, + const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, + size_t flip = sizeof(vfloat)) + { + const vfloat round_down = 1.0f-3.0f*float(ulp); + const vfloat round_up = 1.0f+3.0f*float(ulp); + org = Vec3vf(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir_near = round_down*Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + rdir_far = round_up *Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + +#if defined(__AVX512ER__) 
// KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf org, dir, rdir_near, rdir_far; +#if defined(__AVX512ER__) // KNL+ + vint16 permX, permY, permZ; +#endif + + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Full (with tnear and tfar) */ + template + struct TravRay : TravRayBase + { + __forceinline TravRay() {} + + __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) + : TravRayBase(ray_org, ray_dir), + tnear(ray_tnear), tfar(ray_tfar) {} + + template + __forceinline TravRay(size_t k, const Vec3vf& ray_org, const Vec3vf& ray_dir, + const Vec3vf& ray_rdir, const Vec3vi& nearXYZ, + float ray_tnear, float ray_tfar, + size_t flip = sizeof(vfloat)) + : TravRayBase(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip), + tnear(ray_tnear), tfar(ray_tfar) {} + + vfloat tnear; + vfloat tfar; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // Point Query structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template + struct TravPointQuery + { + __forceinline TravPointQuery() {} + + __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad) + { + org = Vec3vf(query_org.x, query_org.y, query_org.z); + rad = Vec3vf(query_rad.x, query_rad.y, query_rad.z); + } + + __forceinline vfloat const& tfar() const { + return rad.x; + } + + Vec3vf org, rad; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // point query + ////////////////////////////////////////////////////////////////////////////////////// + + template + 
__forceinline size_t pointQuerySphereDistAndMask( + const TravPointQuery& query, vfloat& dist, vfloat const& minX, vfloat const& maxX, + vfloat const& minY, vfloat const& maxY, vfloat const& minZ, vfloat const& maxZ) + { + const vfloat vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool vmask = dist <= query.tfar()*query.tfar(); + const vbool valid = minX <= maxX; + return movemask(vmask) & movemask(valid); + } + + template + __forceinline size_t pointQueryNodeSphere(const typename BVHN::AABBNode* node, const TravPointQuery& query, vfloat& dist) + { + const vfloat minX = vfloat::load((float*)((const char*)&node->lower_x)); + const vfloat minY = vfloat::load((float*)((const char*)&node->lower_y)); + const vfloat minZ = vfloat::load((float*)((const char*)&node->lower_z)); + const vfloat maxX = vfloat::load((float*)((const char*)&node->upper_x)); + const vfloat maxY = vfloat::load((float*)((const char*)&node->upper_y)); + const vfloat maxZ = vfloat::load((float*)((const char*)&node->upper_z)); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template + __forceinline size_t pointQueryNodeSphere(const typename BVHN::AABBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + const vfloat* pMinX = (const vfloat*)((const char*)&node->lower_x); + const vfloat* pMinY = (const vfloat*)((const char*)&node->lower_y); + const vfloat* pMinZ = (const vfloat*)((const char*)&node->lower_z); + const vfloat* pMaxX = (const vfloat*)((const char*)&node->upper_x); + const vfloat* pMaxY = (const vfloat*)((const char*)&node->upper_y); + const vfloat* pMaxZ = (const vfloat*)((const char*)&node->upper_z); + const vfloat minX = madd(time,pMinX[6],vfloat(pMinX[0])); + const vfloat minY = madd(time,pMinY[6],vfloat(pMinY[0])); + const 
vfloat minZ = madd(time,pMinZ[6],vfloat(pMinZ[0])); + const vfloat maxX = madd(time,pMaxX[6],vfloat(pMaxX[0])); + const vfloat maxY = madd(time,pMaxY[6],vfloat(pMaxY[0])); + const vfloat maxZ = madd(time,pMaxZ[6],vfloat(pMaxZ[0])); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template + __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN::NodeRef ref, const TravPointQuery& query, const float time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeSphere(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + const vbool vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template + __forceinline size_t pointQueryNodeSphere(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) + { + const vfloat start_x(node->start.x); + const vfloat scale_x(node->scale.x); + const vfloat minX = madd(node->template dequantize((0*sizeof(vfloat)) >> 2),scale_x,start_x); + const vfloat maxX = madd(node->template dequantize((1*sizeof(vfloat)) >> 2),scale_x,start_x); + const vfloat start_y(node->start.y); + const vfloat scale_y(node->scale.y); + const vfloat minY = madd(node->template dequantize((2*sizeof(vfloat)) >> 2),scale_y,start_y); + const vfloat maxY = madd(node->template dequantize((3*sizeof(vfloat)) >> 2),scale_y,start_y); + const vfloat start_z(node->start.z); + const vfloat scale_z(node->scale.z); + const vfloat minZ = madd(node->template dequantize((4*sizeof(vfloat)) >> 2),scale_z,start_z); + const vfloat maxZ = madd(node->template dequantize((5*sizeof(vfloat)) >> 2),scale_z,start_z); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template + __forceinline size_t 
pointQueryNodeSphere(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + const vfloat minX = node->dequantizeLowerX(time); + const vfloat maxX = node->dequantizeUpperX(time); + const vfloat minY = node->dequantizeLowerY(time); + const vfloat maxY = node->dequantizeUpperY(time); + const vfloat minZ = node->dequantizeLowerZ(time); + const vfloat maxZ = node->dequantizeUpperZ(time); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template + __forceinline size_t pointQueryNodeSphere(const typename BVHN::OBBNode* node, const TravPointQuery& query, vfloat& dist) + { + // TODO: point query - implement + const vbool vmask = vbool(true); + const size_t mask = movemask(vmask) & ((1<(0.0f); + return mask; + } + + template + __forceinline size_t pointQueryNodeSphere(const typename BVHN::OBBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + // TODO: point query - implement + const vbool vmask = vbool(true); + const size_t mask = movemask(vmask) & ((1<(0.0f); + return mask; + } + + template + __forceinline size_t pointQueryAABBDistAndMask( + const TravPointQuery& query, vfloat& dist, vfloat const& minX, vfloat const& maxX, + vfloat const& minY, vfloat const& maxY, vfloat const& minZ, vfloat const& maxZ) + { + const vfloat vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool valid = minX <= maxX; + const vbool vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | + (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) | + (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z)); + return movemask(vmask) & movemask(valid); + } + + template + __forceinline size_t 
pointQueryNodeAABB(const typename BVHN::AABBNode* node, const TravPointQuery& query, vfloat& dist) + { + const vfloat minX = vfloat::load((float*)((const char*)&node->lower_x)); + const vfloat minY = vfloat::load((float*)((const char*)&node->lower_y)); + const vfloat minZ = vfloat::load((float*)((const char*)&node->lower_z)); + const vfloat maxX = vfloat::load((float*)((const char*)&node->upper_x)); + const vfloat maxY = vfloat::load((float*)((const char*)&node->upper_y)); + const vfloat maxZ = vfloat::load((float*)((const char*)&node->upper_z)); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template + __forceinline size_t pointQueryNodeAABB(const typename BVHN::AABBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + const vfloat* pMinX = (const vfloat*)((const char*)&node->lower_x); + const vfloat* pMinY = (const vfloat*)((const char*)&node->lower_y); + const vfloat* pMinZ = (const vfloat*)((const char*)&node->lower_z); + const vfloat* pMaxX = (const vfloat*)((const char*)&node->upper_x); + const vfloat* pMaxY = (const vfloat*)((const char*)&node->upper_y); + const vfloat* pMaxZ = (const vfloat*)((const char*)&node->upper_z); + const vfloat minX = madd(time,pMinX[6],vfloat(pMinX[0])); + const vfloat minY = madd(time,pMinY[6],vfloat(pMinY[0])); + const vfloat minZ = madd(time,pMinZ[6],vfloat(pMinZ[0])); + const vfloat maxX = madd(time,pMaxX[6],vfloat(pMaxX[0])); + const vfloat maxY = madd(time,pMaxY[6],vfloat(pMaxY[0])); + const vfloat maxZ = madd(time,pMaxZ[6],vfloat(pMaxZ[0])); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template + __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN::NodeRef ref, const TravPointQuery& query, const float time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeAABB(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + 
const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + const vbool vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template + __forceinline size_t pointQueryNodeAABB(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat start_x(node->start.x); + const vfloat scale_x(node->scale.x); + const vfloat minX = madd(node->template dequantize((0*sizeof(vfloat)) >> 2),scale_x,start_x); + const vfloat maxX = madd(node->template dequantize((1*sizeof(vfloat)) >> 2),scale_x,start_x); + const vfloat start_y(node->start.y); + const vfloat scale_y(node->scale.y); + const vfloat minY = madd(node->template dequantize((2*sizeof(vfloat)) >> 2),scale_y,start_y); + const vfloat maxY = madd(node->template dequantize((3*sizeof(vfloat)) >> 2),scale_y,start_y); + const vfloat start_z(node->start.z); + const vfloat scale_z(node->scale.z); + const vfloat minZ = madd(node->template dequantize((4*sizeof(vfloat)) >> 2),scale_z,start_z); + const vfloat maxZ = madd(node->template dequantize((5*sizeof(vfloat)) >> 2),scale_z,start_z); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template + __forceinline size_t pointQueryNodeAABB(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat minX = node->dequantizeLowerX(time); + const vfloat maxX = node->dequantizeUpperX(time); + const vfloat minY = node->dequantizeLowerY(time); + const vfloat maxY = node->dequantizeUpperY(time); + const vfloat minZ = node->dequantizeLowerZ(time); + const vfloat maxZ = node->dequantizeUpperZ(time); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template + __forceinline size_t 
pointQueryNodeAABB(const typename BVHN::OBBNode* node, const TravPointQuery& query, vfloat& dist) + { + // TODO: point query - implement + const vbool vmask = vbool(true); + const size_t mask = movemask(vmask) & ((1<(0.0f); + return mask; + } + + template + __forceinline size_t pointQueryNodeAABB(const typename BVHN::OBBNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + // TODO: point query - implement + const vbool vmask = vbool(true); + const size_t mask = movemask(vmask) & ((1<(0.0f); + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode(const typename BVHN::AABBNode* node, const TravRay& ray, vfloat& dist); + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) + { +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = 
msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__aarch64__) + const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); + const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX 
,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) + { +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = 
msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif + +#else + const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#endif + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) + { + const vfloat16 tNearX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = 
msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) + { + const vllong8 invalid((size_t)BVH8::emptyNode); + const vboold8 m_valid(invalid != vllong8::loadu(node->children)); + const vfloat16 bminmaxX = permute(vfloat16::load((const float*)&node->lower_x), ray.permX); + const vfloat16 bminmaxY = permute(vfloat16::load((const float*)&node->lower_y), ray.permY); + const vfloat16 bminmaxZ = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ); + const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(vboolf16(m_valid),tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#endif + + 
////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeRobust(const typename BVHN::AABBNode* node, const TravRay& ray, vfloat& dist) + { + const vfloat tNearX = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat tNearY = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat tNearZ = (vfloat::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat tFarX = (vfloat::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat tFarY = (vfloat::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat tFarZ = (vfloat::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist) + { + const vfloat16 tNearX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat16 
tFarY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le((1 << 4)-1,tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist) + { + const vfloat16 tNearX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le((1 << 8)-1,tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode(const typename BVHN::AABBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + const vfloat* pNearX = 
(const vfloat*)((const char*)&node->lower_x+ray.nearX); + const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); + const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); + const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); + const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); + const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir.z; + 
const vfloat tFarX = (madd(time,pFarX [6],vfloat(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW + const vfloat tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1< tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + const vbool vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeRobust(const typename BVHN::AABBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); + const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); + const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); + const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat* pFarX = (const vfloat*)((const 
char*)&node->lower_x+ray.farX); + const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); + const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); + const vfloat tFarX = (madd(time,pFarX[6],vfloat(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat tFarY = (madd(time,pFarY[6],vfloat(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat tFarZ = (madd(time,pFarZ[6],vfloat(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + const size_t mask = movemask(tNear <= tFar); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeMB4D(const typename BVHN::NodeRef ref, const TravRay& ray, const float time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); + const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); + const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); + const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); + const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); + const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); +#if defined (__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = 
madd(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat tNearX = msub(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat tNearY = msub(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat tNearZ = msub(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat tFarX = msub(madd(time,pFarX [6],vfloat(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat tFarY = msub(madd(time,pFarY [6],vfloat(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat tFarZ = msub(madd(time,pFarZ [6],vfloat(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat tFarX = (madd(time,pFarX [6],vfloat(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat tFarY = (madd(time,pFarY [6],vfloat(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat tFarZ = (madd(time,pFarZ [6],vfloat(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__FMA_X4__) && !defined(__AVX512F__) + const vfloat tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); + const vfloat tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); +#else + const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); +#endif + vbool vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + 
////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeMB4DRobust(const typename BVHN::NodeRef ref, const TravRay& ray, const float time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat* pNearX = (const vfloat*)((const char*)&node->lower_x+ray.nearX); + const vfloat* pNearY = (const vfloat*)((const char*)&node->lower_x+ray.nearY); + const vfloat* pNearZ = (const vfloat*)((const char*)&node->lower_x+ray.nearZ); + const vfloat tNearX = (madd(time,pNearX[6],vfloat(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat tNearY = (madd(time,pNearY[6],vfloat(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat tNearZ = (madd(time,pNearZ[6],vfloat(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat* pFarX = (const vfloat*)((const char*)&node->lower_x+ray.farX); + const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); + const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); + const vfloat tFarX = (madd(time,pFarX[6],vfloat(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat tFarY = (madd(time,pFarY[6],vfloat(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat tFarZ = (madd(time,pFarZ[6],vfloat(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + vbool vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast 
QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist); + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = 
msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 
upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask & mvalid; + } + + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 
tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + 
return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + + dist = tNear; + return mask & mvalid; + } + + +#endif + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 start_x(node->start.x); + const vfloat16 scale_x(node->scale.x); + const vfloat16 
lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); + const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); + const vfloat16 start_y(node->start.y); + const vfloat16 scale_y(node->scale.y); + const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); + const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); + const vfloat16 start_z(node->start.z); + const vfloat16 scale_z(node->scale.z); + const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); + const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); + + const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask) & mvalid; + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 start_x(node->start.x); + const vfloat16 scale_x(node->scale.x); + const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); + const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); + const vfloat16 start_y(node->start.y); + const vfloat16 scale_y(node->scale.y); + const vfloat16 lower_y = 
madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); + const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); + const vfloat16 start_z(node->start.z); + const vfloat16 scale_z(node->scale.z); + const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); + const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); + + const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask) & mvalid; + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) + { + const vbool16 m_valid(node->validMask16()); + const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); + const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); + const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); + const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + 
return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist) + { + const vbool16 m_valid(node->validMask16()); + const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); + const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); + const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); + const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: this is not conservative !!!!!!!!! + const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y; + const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + +#endif + + + template + __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + const vboolf mvalid = node->validMask(); + const vfloat lower_x = node->dequantizeLowerX(time); + const vfloat upper_x = node->dequantizeUpperX(time); + const vfloat lower_y = node->dequantizeLowerY(time); + const vfloat upper_y = node->dequantizeUpperY(time); + const vfloat lower_z = node->dequantizeLowerZ(time); + const vfloat upper_z = node->dequantizeUpperZ(time); +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + 
const vfloat tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat tminX = mini(tNearX,tFarX); + const vfloat tmaxX = maxi(tNearX,tFarX); + const vfloat tminY = mini(tNearY,tFarY); + const vfloat tmaxY = maxi(tNearY,tFarY); + const vfloat tminZ = mini(tNearZ,tFarZ); + const vfloat tmaxZ = maxi(tNearZ,tFarZ); + const vfloat tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template + __forceinline size_t intersectNode(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + const vboolf mvalid = node->validMask(); + const vfloat lower_x = node->dequantizeLowerX(time); + const vfloat upper_x = node->dequantizeUpperX(time); + const vfloat lower_y = node->dequantizeLowerY(time); + const vfloat upper_y = node->dequantizeUpperY(time); + const vfloat lower_z = node->dequantizeLowerZ(time); + const vfloat upper_z = node->dequantizeUpperZ(time); + const vfloat tNearX = (lower_x - ray.org.x) * 
ray.rdir_near.x; + const vfloat tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat tminX = mini(tNearX,tFarX); + const vfloat tmaxX = maxi(tNearX,tFarX); + const vfloat tminY = mini(tNearY,tFarY); + const vfloat tmaxY = maxi(tNearY,tFarY); + const vfloat tminZ = mini(tNearZ,tFarZ); + const vfloat tmaxZ = maxi(tNearZ,tFarZ); + const vfloat tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + +#if defined(__AVX512ER__) + // for KNL + template<> + __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 lower_x = node->dequantizeLowerX(time); + const vfloat16 upper_x = node->dequantizeUpperX(time); + const vfloat16 lower_y = node->dequantizeLowerY(time); + const vfloat16 upper_y = node->dequantizeUpperY(time); + const vfloat16 lower_z = node->dequantizeLowerZ(time); + const vfloat16 upper_z = node->dequantizeUpperZ(time); + + const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, 
ray.org_rdir.z); + + const vfloat16 tminX = min(tNearX,tFarX); + const vfloat16 tmaxX = max(tNearX,tFarX); + const vfloat16 tminY = min(tNearY,tFarY); + const vfloat16 tmaxY = max(tNearY,tFarY); + const vfloat16 tminZ = min(tNearZ,tFarZ); + const vfloat16 tmaxZ = max(tNearZ,tFarZ); + const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); + const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); + const vbool16 vmask = tNear <= tFar; + const size_t mask = movemask(vmask) & mvalid; + dist = extractN<4,0>(tNear); + return mask; + } + + + // for KNL + template<> + __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 lower_x = node->dequantizeLowerX(time); + const vfloat16 upper_x = node->dequantizeUpperX(time); + const vfloat16 lower_y = node->dequantizeLowerY(time); + const vfloat16 upper_y = node->dequantizeUpperY(time); + const vfloat16 lower_z = node->dequantizeLowerZ(time); + const vfloat16 upper_z = node->dequantizeUpperZ(time); + + const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat16 tminX = min(tNearX,tFarX); + const vfloat16 tmaxX = max(tNearX,tFarX); + const vfloat16 tminY = min(tNearY,tFarY); + const vfloat16 tmaxY = max(tNearY,tFarY); + const vfloat16 tminZ = min(tNearZ,tFarZ); + const vfloat16 tmaxZ = max(tNearZ,tFarZ); + const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); + const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); + const vbool16 vmask = tNear <= tFar; + const size_t mask = movemask(vmask) & mvalid; + 
dist = extractN<4,0>(tNear); + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode(const typename BVHN::OBBNode* node, const TravRay& ray, vfloat& dist) + { + const Vec3vf dir = xfmVector(node->naabb,ray.dir); + //const Vec3vf nrdir = Vec3vf(vfloat(-1.0f))/dir; + const Vec3vf nrdir = Vec3vf(vfloat(-1.0f))*rcp_safe(dir); + const Vec3vf org = xfmPoint(node->naabb,ray.org); + const Vec3vf tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir; + const Vec3vf tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir; + + const vfloat tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat(1.0f+3.0f*float(ulp)); + } + const vbool vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode(const typename BVHN::OBBNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + const AffineSpace3vf xfm = node->space0; + const Vec3vf b0_lower = zero; + const Vec3vf b0_upper = one; + const Vec3vf lower = lerp(b0_lower,node->b1.lower,vfloat(time)); + const Vec3vf upper = lerp(b0_upper,node->b1.upper,vfloat(time)); + + const 
BBox3vf bounds(lower,upper); + const Vec3vf dir = xfmVector(xfm,ray.dir); + const Vec3vf rdir = rcp_safe(dir); + const Vec3vf org = xfmPoint(xfm,ray.org); + + const Vec3vf tLowerXYZ = (bounds.lower - org) * rdir; + const Vec3vf tUpperXYZ = (bounds.upper - org) * rdir; + + const vfloat tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat(1.0f+3.0f*float(ulp)); + } + const vbool vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in point query raversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! 
Computes traversal information for N nodes with 1 point query */ + template + struct BVHNNodePointQuerySphere1; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphereMB4D(node, query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else return false; + 
return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeSphereMB4D(node, query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQuerySphere1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere((const typename BVHN::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template + struct BVHNQuantizedBaseNodePointQuerySphere1 + { + static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) + { + return pointQueryNodeSphere(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + return pointQueryNodeSphere(node,query,time,dist); + } + }; + + /*! 
Computes traversal information for N nodes with 1 point query */ + template + struct BVHNNodePointQueryAABB1; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + return true; + } + }; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABBMB4D(node, query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + 
template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeAABBMB4D(node, query, time, dist); + return true; + } + }; + + template + struct BVHNNodePointQueryAABB1 + { + static __forceinline bool pointQuery(const typename BVHN::NodeRef& node, const TravPointQuery& query, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB((const typename BVHN::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template + struct BVHNQuantizedBaseNodePointQueryAABB1 + { + static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNode* node, const TravPointQuery& query, vfloat& dist) + { + return pointQueryNodeAABB(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN::QuantizedBaseNodeMB* node, const TravPointQuery& query, const float time, vfloat& dist) + { + return pointQueryNodeAABB(node,query,time,dist); + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! 
Intersects N nodes with 1 ray */ + template + struct BVHNNodeIntersector1; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNode(), ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4D(node, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4DRobust(node, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + 
{ + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = 
intersectNodeMB4D(node, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4DRobust(node, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode((const typename BVHN::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersector1 + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, const TravRay& ray, float time, vfloat& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust((const typename BVHN::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + /*! 
Intersects N nodes with K rays */ + template + struct BVHNQuantizedBaseNodeIntersector1; + + template + struct BVHNQuantizedBaseNodeIntersector1 + { + static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + template + struct BVHNQuantizedBaseNodeIntersector1 + { + static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNode* node, const TravRay& ray, vfloat& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN::QuantizedBaseNodeMB* node, const TravRay& ray, const float time, vfloat& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h new file mode 100644 index 00000000000..800ac8b4782 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h @@ -0,0 +1,269 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Frustum structure used in hybrid and stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /* + Optimized frustum test. We calculate t=(p-org)/dir in ray/box + intersection. We assume the rays are split by octant, thus + dir intervals are either positive or negative in each + dimension. 
+ + Case 1: dir.min >= 0 && dir.max >= 0: + t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + + Case 2: dir.min < 0 && dir.max < 0: + t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + */ + + template + struct Frustum; + + /* Fast variant */ + template<> + struct Frustum + { + __forceinline Frustum() {} + + template + __forceinline Frustum(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + init(valid, org, rdir, ray_tnear, ray_tfar, N); + } + + template + __forceinline void init(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const 
Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + +#if defined (__aarch64__) + neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); + neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); +#else + min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); + max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); +#endif + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template + __forceinline void updateMaxDist(const vfloat& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + +#if defined (__aarch64__) + Vec3fa neg_min_org_rdir; + Vec3fa neg_max_org_rdir; +#else + Vec3fa min_org_rdir; + Vec3fa max_org_rdir; +#endif + float min_dist; + float max_dist; + }; + + typedef Frustum FrustumFast; + + /* Robust variant */ + template<> + struct Frustum + { + __forceinline Frustum() {} + + template + __forceinline Frustum(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + init(valid, org, rdir, ray_tnear, ray_tfar, N); + } + + template + __forceinline void init(const vbool& valid, const Vec3vf& org, const Vec3vf& rdir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + 
reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org = select(pos_rdir, reduced_max_org, reduced_min_org); + max_org = select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template + __forceinline void updateMaxDist(const vfloat& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org; + Vec3fa max_org; + + float min_dist; + float max_dist; + }; + + typedef Frustum FrustumRobust; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeFrustum(const typename 
BVHN::AABBNode* __restrict__ node, + const FrustumFast& frustum, vfloat& dist) + { + const vfloat bminX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat bminY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat bminZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat bmaxX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); + +#if defined (__aarch64__) + const vfloat fminX = madd(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.neg_min_org_rdir.x)); + const vfloat fminY = madd(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.neg_min_org_rdir.y)); + const vfloat fminZ = madd(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.neg_min_org_rdir.z)); + const vfloat fmaxX = madd(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.neg_max_org_rdir.x)); + const vfloat fmaxY = madd(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.neg_max_org_rdir.y)); + const vfloat fmaxZ = madd(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.neg_max_org_rdir.z)); +#else + const vfloat fminX = msub(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.min_org_rdir.x)); + const vfloat fminY = msub(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.min_org_rdir.y)); + const vfloat fminZ = msub(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.min_org_rdir.z)); + const vfloat fmaxX = msub(bmaxX, vfloat(frustum.max_rdir.x), vfloat(frustum.max_org_rdir.x)); + const vfloat fmaxY = msub(bmaxY, vfloat(frustum.max_rdir.y), vfloat(frustum.max_org_rdir.y)); + const vfloat fmaxZ = msub(bmaxZ, vfloat(frustum.max_rdir.z), vfloat(frustum.max_org_rdir.z)); +#endif + const vfloat fmin = maxi(fminX, fminY, fminZ, vfloat(frustum.min_dist)); + dist = fmin; + const vfloat fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat(frustum.max_dist)); 
+ const vbool vmask_node_hit = fmin <= fmax; + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNodeFrustum(const typename BVHN::AABBNode* __restrict__ node, + const FrustumRobust& frustum, vfloat& dist) + { + const vfloat bminX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat bminY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat bminZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat bmaxX = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat fminX = (bminX - vfloat(frustum.min_org.x)) * vfloat(frustum.min_rdir.x); + const vfloat fminY = (bminY - vfloat(frustum.min_org.y)) * vfloat(frustum.min_rdir.y); + const vfloat fminZ = (bminZ - vfloat(frustum.min_org.z)) * vfloat(frustum.min_rdir.z); + const vfloat fmaxX = (bmaxX - vfloat(frustum.max_org.x)) * vfloat(frustum.max_rdir.x); + const vfloat fmaxY = (bmaxY - vfloat(frustum.max_org.y)) * vfloat(frustum.max_rdir.y); + const vfloat fmaxZ = (bmaxZ - vfloat(frustum.max_org.z)) * vfloat(frustum.max_rdir.z); + + const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const float round_up = 1.0f+2.0f*float(ulp); + const vfloat fmin = max(fminX, fminY, fminZ, vfloat(frustum.min_dist)); + dist = fmin; + const vfloat fmax = min(fmaxX, fmaxY, fmaxZ, vfloat(frustum.max_dist)); + const vbool vmask_node_hit = (round_down*fmin <= round_up*fmax); + size_t m_node = movemask(vmask_node_hit) & 
(((size_t)1 << N)-1); + return m_node; + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h new file mode 100644 index 00000000000..0543e56f8ea --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h @@ -0,0 +1,843 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template + struct TravRayK; + + /* Fast variant */ + template + struct TravRayK + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(org * rdir); +#elif defined(__AVX2__) + org_rdir = org * rdir; +#endif + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint(0*size), vint(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint(2*size), vint(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint(4*size), vint(5*size)); + } + } + + Vec3vf org; + Vec3vf dir; + Vec3vf rdir; +#if defined(__aarch64__) + Vec3vf neg_org_rdir; +#elif defined(__AVX2__) + Vec3vf org_rdir; +#endif + Vec3vi nearXYZ; + vfloat tnear; + vfloat tfar; + }; + + template + using TravRayKFast = TravRayK; + + /* Robust variant */ + template + struct 
TravRayK + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = vfloat(1.0f)/(zero_fix(ray_dir)); + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint(0*size), vint(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint(2*size), vint(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint(4*size), vint(5*size)); + } + } + + Vec3vf org; + Vec3vf dir; + Vec3vf rdir; + Vec3vi nearXYZ; + vfloat tnear; + vfloat tfar; + }; + + template + using TravRayKRobust = TravRayK; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeK(const typename BVHN::AABBNode* node, size_t i, + const TravRayKFast& ray, vfloat& dist) + + { +#if defined(__aarch64__) + const vfloat lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); + 
const vfloat lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool 
intersectNodeKRobust(const typename BVHN::AABBNode* node, size_t i, + const TravRayKRobust& ray, vfloat& dist) + { + // FIXME: use per instruction rounding for AVX512 + const vfloat lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeK(const typename BVHN::AABBNodeMB* node, const size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist) + { + const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); + const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); + const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); + const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); + const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); + const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); + +#if 
defined(__aarch64__) + const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), 
maxi(lclipMinZ, lclipMaxZ)); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); +#else + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); +#endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeKRobust(const typename BVHN::AABBNodeMB* node, const size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist) + { + const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); + const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); + const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); + const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); + const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); + const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); + + const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + const vfloat lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *mini(max(lclipMinX, lclipMaxX), 
max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeKMB4D(const typename BVHN::NodeRef ref, const size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); + const vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); + const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); + const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); + const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); + const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); + +#if defined(__aarch64__) + const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(vupper_z, ray.rdir.z, 
ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + lhit = lhit & (vfloat(node1->lower_t[i]) <= time) & (time < vfloat(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeKMB4DRobust(const typename BVHN::NodeRef ref, const size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist) + { + const typename BVHN::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat vlower_x = madd(time, vfloat(node->lower_dx[i]), vfloat(node->lower_x[i])); + const 
vfloat vlower_y = madd(time, vfloat(node->lower_dy[i]), vfloat(node->lower_y[i])); + const vfloat vlower_z = madd(time, vfloat(node->lower_dz[i]), vfloat(node->lower_z[i])); + const vfloat vupper_x = madd(time, vfloat(node->upper_dx[i]), vfloat(node->upper_x[i])); + const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); + const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); + + const vfloat lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN::AABBNodeMB4D* node1 = (const typename BVHN::AABBNodeMB4D*) node; + lhit = lhit & (vfloat(node1->lower_t[i]) <= time) & (time < vfloat(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeK(const typename BVHN::OBBNode* node, const size_t i, + const TravRayK& ray, vfloat& dist) + { + const AffineSpace3vf naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), + 
Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]), + Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]), + Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i])); + + const Vec3vf dir = xfmVector(naabb, ray.dir); + const Vec3vf nrdir = Vec3vf(vfloat(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1? + const Vec3vf org = xfmPoint(naabb, ray.org); + + const vfloat lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir; + const vfloat lclipMinY = org.y * nrdir.y; + const vfloat lclipMinZ = org.z * nrdir.z; + const vfloat lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir; + const vfloat lclipMaxY = lclipMinY - nrdir.y; + const vfloat lclipMaxZ = lclipMinZ - nrdir.z; + + vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat(1.0f+3.0f*float(ulp)); + } + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectNodeK(const typename BVHN::OBBNodeMB* node, const size_t i, + const TravRayK& ray, const vfloat& time, vfloat& dist) + { + const AffineSpace3vf xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), + Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]), + Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]), + Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i])); + + const Vec3vf b0_lower = zero; + const Vec3vf b0_upper = one; + 
const Vec3vf b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]); + const Vec3vf b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]); + const Vec3vf lower = lerp(b0_lower, b1_lower, time); + const Vec3vf upper = lerp(b0_upper, b1_upper, time); + + const Vec3vf dir = xfmVector(xfm, ray.dir); + const Vec3vf rdir = rcp_safe(dir); + const Vec3vf org = xfmPoint(xfm, ray.org); + + const vfloat lclipMinX = (lower.x - org.x) * rdir.x; + const vfloat lclipMinY = (lower.y - org.y) * rdir.y; + const vfloat lclipMinZ = (lower.z - org.z) * rdir.z; + const vfloat lclipMaxX = (upper.x - org.x) * rdir.x; + const vfloat lclipMaxY = (upper.y - org.y) * rdir.y; + const vfloat lclipMaxZ = (upper.z - org.z) * rdir.z; + + vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat(1.0f+3.0f*float(ulp)); + } + + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + + ////////////////////////////////////////////////////////////////////////////////////// + // QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline vbool intersectQuantizedNodeK(const typename BVHN::QuantizedBaseNode* node, size_t i, + const TravRayK& ray, vfloat& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat lower_x = node->dequantizeLowerX(); + const vfloat upper_x = node->dequantizeUpperX(); + const vfloat lower_y = node->dequantizeLowerY(); + const vfloat upper_y = node->dequantizeUpperY(); + const vfloat lower_z = node->dequantizeLowerZ(); + const vfloat upper_z = node->dequantizeUpperZ(); + + #if defined(__aarch64__) + const vfloat lclipMinX = 
madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); + #elif defined(__AVX2__) + const vfloat lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), 
maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + template + __forceinline vbool intersectQuantizedNodeK(const typename BVHN::QuantizedBaseNode* node, size_t i, + const TravRayK& ray, vfloat& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat lower_x = node->dequantizeLowerX(); + const vfloat upper_x = node->dequantizeUpperX(); + const vfloat lower_y = node->dequantizeLowerY(); + const vfloat upper_y = node->dequantizeUpperY(); + const vfloat lower_z = node->dequantizeLowerZ(); + const vfloat upper_z = node->dequantizeUpperZ(); + + const vfloat lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + template + __forceinline vbool intersectQuantizedNodeMBK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK& ray, const vfloat& time, vfloat& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat lower_x = node->dequantizeLowerX(i,time); + const vfloat upper_x = 
node->dequantizeUpperX(i,time); + const vfloat lower_y = node->dequantizeLowerY(i,time); + const vfloat upper_y = node->dequantizeUpperY(i,time); + const vfloat lower_z = node->dequantizeLowerZ(i,time); + const vfloat upper_z = node->dequantizeUpperZ(i,time); + +#if defined(__aarch64__) + const vfloat lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + #endif + const vfloat lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + template + __forceinline vbool intersectQuantizedNodeMBK(const typename BVHN::QuantizedBaseNodeMB* 
node, const size_t i, + const TravRayK& ray, const vfloat& time, vfloat& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat lower_x = node->dequantizeLowerX(i,time); + const vfloat upper_x = node->dequantizeUpperX(i,time); + const vfloat lower_y = node->dequantizeLowerY(i,time); + const vfloat upper_y = node->dequantizeUpperY(i,time); + const vfloat lower_z = node->dequantizeLowerZ(i,time); + const vfloat upper_z = node->dequantizeUpperZ(i,time); + + const vfloat lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with K rays */ + template + struct BVHNNodeIntersectorK; + + template + struct BVHNNodeIntersectorK + { + /* vmask is both an input and an output parameter! Its initial value should be the parent node + hit mask, which is used for correctly computing the current hit mask. 
The parent hit mask + is actually required only for motion blur node intersections (because different rays may + have different times), so for regular nodes vmask is simply overwritten. */ + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask = intersectNodeK(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask = intersectNodeKRobust(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask = intersectNodeK(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask = intersectNodeKRobust(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeK(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if 
(likely(node.isAABBNode())) vmask = intersectNodeKRobust(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeK(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask &= intersectNodeKMB4D(node, i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + vmask &= intersectNodeKMB4DRobust(node, i, ray, time, dist); + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKFast& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= 
intersectNodeKMB4D(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + template + struct BVHNNodeIntersectorK + { + static __forceinline bool intersect(const typename BVHN::NodeRef& node, size_t i, + const TravRayKRobust& ray, const vfloat& time, vfloat& dist, vbool& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4DRobust(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + + /*! Intersects N nodes with K rays */ + template + struct BVHNQuantizedBaseNodeIntersectorK; + + template + struct BVHNQuantizedBaseNodeIntersectorK + { + static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNode* node, const size_t i, + const TravRayK& ray, vfloat& dist) + { + return intersectQuantizedNodeK(node,i,ray,dist); + } + + static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK& ray, const vfloat& time, vfloat& dist) + { + return intersectQuantizedNodeMBK(node,i,ray,time,dist); + } + + }; + + template + struct BVHNQuantizedBaseNodeIntersectorK + { + static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNode* node, const size_t i, + const TravRayK& ray, vfloat& dist) + { + return intersectQuantizedNodeK(node,i,ray,dist); + } + + static __forceinline vbool intersectK(const typename BVHN::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK& ray, const vfloat& time, vfloat& dist) + { + return intersectQuantizedNodeMBK(node,i,ray,time,dist); + } + }; + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h new 
file mode 100644 index 00000000000..f379b57aeac --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template + struct TravRayKStream; + + /* Fast variant */ + template + struct TravRayKStream + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir) + { + rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(ray_org * rdir); +#else + org_rdir = ray_org * rdir; +#endif + } + + Vec3vf rdir; +#if defined(__aarch64__) + Vec3vf neg_org_rdir; +#else + Vec3vf org_rdir; +#endif + vfloat tnear; + vfloat tfar; + }; + + template + using TravRayKStreamFast = TravRayKStream; + + /* Robust variant */ + template + struct TravRayKStream + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& ray_tnear, const vfloat& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf& ray_org, const Vec3vf& ray_dir) + { + rdir = vfloat(1.0f)/(zero_fix(ray_dir)); + org = ray_org; + } + + Vec3vf rdir; + Vec3vf org; + vfloat tnear; + vfloat tfar; + }; + + template + using TravRayKStreamRobust = TravRayKStream; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode 
intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode1(const typename BVHN::AABBNode* __restrict__ node, + const TravRayKStreamFast& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); + const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); + const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); + const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); + const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); + +#if defined (__aarch64__) + const vfloat rminX = madd(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); + const vfloat rminY = madd(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); + const vfloat rminZ = madd(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); + const vfloat rmaxX = madd(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.neg_org_rdir.x[k])); + const vfloat rmaxY = madd(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.neg_org_rdir.y[k])); + const vfloat rmaxZ = madd(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.neg_org_rdir.z[k])); +#else + const vfloat rminX = msub(bminX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); + const vfloat rminY = msub(bminY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); + const vfloat rminZ = msub(bminZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); + const vfloat rmaxX = msub(bmaxX, vfloat(ray.rdir.x[k]), vfloat(ray.org_rdir.x[k])); + const vfloat rmaxY = msub(bmaxY, vfloat(ray.rdir.y[k]), vfloat(ray.org_rdir.y[k])); + const vfloat rmaxZ = msub(bmaxZ, vfloat(ray.rdir.z[k]), vfloat(ray.org_rdir.z[k])); +#endif + const vfloat rmin = maxi(rminX, rminY, rminZ, vfloat(ray.tnear[k])); + const 
vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar[k])); + + const vbool vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template + __forceinline size_t intersectNodeK(const typename BVHN::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamFast& ray, const NearFarPrecalculations& nf) + { + char* ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat bminX = *(const float*)(ptr + nf.nearX); + const vfloat bminY = *(const float*)(ptr + nf.nearY); + const vfloat bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat bmaxX = *(const float*)(ptr + nf.farX); + const vfloat bmaxY = *(const float*)(ptr + nf.farY); + const vfloat bmaxZ = *(const float*)(ptr + nf.farZ); + +#if defined (__aarch64__) + const vfloat rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); + const vfloat rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); + const vfloat rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); + const vfloat rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); +#endif + + const vfloat rmin = maxi(rminX, rminY, rminZ, ray.tnear); + const vfloat rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); + + const vbool vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + 
////////////////////////////////////////////////////////////////////////////////////// + + template + __forceinline size_t intersectNode1(const typename BVHN::AABBNode* __restrict__ node, + const TravRayKStreamRobust& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat bminX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearX)); + const vfloat bminY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearY)); + const vfloat bminZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat bmaxX = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farX)); + const vfloat bmaxY = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farY)); + const vfloat bmaxZ = vfloat(*(const vfloat*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat rminX = (bminX - vfloat(ray.org.x[k])) * vfloat(ray.rdir.x[k]); + const vfloat rminY = (bminY - vfloat(ray.org.y[k])) * vfloat(ray.rdir.y[k]); + const vfloat rminZ = (bminZ - vfloat(ray.org.z[k])) * vfloat(ray.rdir.z[k]); + const vfloat rmaxX = (bmaxX - vfloat(ray.org.x[k])) * vfloat(ray.rdir.x[k]); + const vfloat rmaxY = (bmaxY - vfloat(ray.org.y[k])) * vfloat(ray.rdir.y[k]); + const vfloat rmaxZ = (bmaxZ - vfloat(ray.org.z[k])) * vfloat(ray.rdir.z[k]); + const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const vfloat rmin = max(rminX, rminY, rminZ, vfloat(ray.tnear[k])); + const vfloat rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar[k])); + + const vbool vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template + __forceinline size_t intersectNodeK(const typename BVHN::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamRobust& ray, const NearFarPrecalculations& nf) + { + char *ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat bminX = *(const float*)(ptr + nf.nearX); + const vfloat bminY = *(const float*)(ptr + nf.nearY); + const 
vfloat bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat bmaxX = *(const float*)(ptr + nf.farX); + const vfloat bmaxY = *(const float*)(ptr + nf.farY); + const vfloat bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat rminX = (bminX - ray.org.x) * ray.rdir.x; + const vfloat rminY = (bminY - ray.org.y) * ray.rdir.y; + const vfloat rminZ = (bminZ - ray.org.z) * ray.rdir.z; + const vfloat rmaxX = (bmaxX - ray.org.x) * ray.rdir.x; + const vfloat rmaxY = (bmaxY - ray.org.y) * ray.rdir.y; + const vfloat rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const vfloat rmin = max(rminX, rminY, rminZ, vfloat(ray.tnear)); + const vfloat rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat(ray.tfar)); + + const vbool vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h new file mode 100644 index 00000000000..c038d3cf21e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accel.h @@ -0,0 +1,556 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "point_query.h" +#include "context.h" + +namespace embree +{ + class Scene; + + /*! Base class for the acceleration structure data. */ + class AccelData : public RefCount + { + ALIGNED_CLASS_(16); + public: + enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 }; + + public: + AccelData (const Type type) + : bounds(empty), type(type) {} + + /*! notifies the acceleration structure about the deletion of some geometry */ + virtual void deleteGeometry(size_t geomID) {}; + + /*! clears the acceleration structure data */ + virtual void clear() = 0; + + /*! returns normal bounds */ + __forceinline BBox3fa getBounds() const { + return bounds.bounds(); + } + + /*! 
returns bounds for some time */ + __forceinline BBox3fa getBounds(float t) const { + return bounds.interpolate(t); + } + + /*! returns linear bounds */ + __forceinline LBBox3fa getLinearBounds() const { + return bounds; + } + + /*! checks if acceleration structure is empty */ + __forceinline bool isEmpty() const { + return bounds.bounds0.lower.x == float(pos_inf); + } + + public: + LBBox3fa bounds; // linear bounds + Type type; + }; + + /*! Base class for all intersectable and buildable acceleration structures. */ + class Accel : public AccelData + { + ALIGNED_CLASS_(16); + public: + + struct Intersectors; + + /*! Type of collide function */ + typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr); + + /*! Type of point query function */ + typedef bool(*PointQueryFunc)(Intersectors* This, /*!< this pointer to accel */ + PointQuery* query, /*!< point query for lookup */ + PointQueryContext* context); /*!< point query context */ + + /*! Type of intersect function pointer for single rays. */ + typedef void (*IntersectFunc)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHit& ray, /*!< ray to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 4. */ + typedef void (*IntersectFunc4)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit4& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 8. */ + typedef void (*IntersectFunc8)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit8& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size 16. 
*/ + typedef void (*IntersectFunc16)(const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRayHit16& ray, /*!< ray packet to intersect */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. */ + typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayHitN** ray, /*!< ray stream to intersect */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + + + /*! Type of occlusion function pointer for single rays. */ + typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */ + RTCRay& ray, /*!< ray to test occlusion */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 4. */ + typedef void (*OccludedFunc4) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay4& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 8. */ + typedef void (*OccludedFunc8) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay8& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of occlusion function pointer for ray packets of size 16. */ + typedef void (*OccludedFunc16) (const void* valid, /*!< pointer to valid mask */ + Intersectors* This, /*!< this pointer to accel */ + RTCRay16& ray, /*!< ray packet to test occlusion. */ + IntersectContext* context); + + /*! Type of intersect function pointer for ray packets of size N. 
*/ + typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */ + RTCRayN** ray, /*!< ray stream to test occlusion */ + const size_t N, /*!< number of rays in stream */ + IntersectContext* context /*!< layout flags */); + typedef void (*ErrorFunc) (); + + struct Collider + { + Collider (ErrorFunc error = nullptr) + : collide((CollideFunc)error), name(nullptr) {} + + Collider (CollideFunc collide, const char* name) + : collide(collide), name(name) {} + + operator bool() const { return name; } + + public: + CollideFunc collide; + const char* name; + }; + + struct Intersector1 + { + Intersector1 (ErrorFunc error = nullptr) + : intersect((IntersectFunc)error), occluded((OccludedFunc)error), name(nullptr) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {} + + Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name) + : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc intersect; + OccludedFunc occluded; + PointQueryFunc pointQuery; + const char* name; + }; + + struct Intersector4 + { + Intersector4 (ErrorFunc error = nullptr) + : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {} + + Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc4 intersect; + OccludedFunc4 occluded; + const char* name; + }; + + struct Intersector8 + { + Intersector8 (ErrorFunc error = nullptr) + : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {} + + Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name) + : 
intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc8 intersect; + OccludedFunc8 occluded; + const char* name; + }; + + struct Intersector16 + { + Intersector16 (ErrorFunc error = nullptr) + : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {} + + Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFunc16 intersect; + OccludedFunc16 occluded; + const char* name; + }; + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + struct Intersectors + { + Intersectors() + : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {} + + Intersectors (ErrorFunc error) + : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {} + + void print(size_t ident) + { + if (collider.name) { + for (size_t i=0; ibuild(); + bounds = accel->bounds; + } + + void deleteGeometry(size_t geomID) { + if (accel ) accel->deleteGeometry(geomID); + if (builder) builder->deleteGeometry(geomID); + } + + void clear() { + if (accel) accel->clear(); + if (builder) builder->clear(); + } + + private: + std::unique_ptr accel; + std::unique_ptr builder; + }; +} 
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp new file mode 100644 index 00000000000..aadb4a64efb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp @@ -0,0 +1,232 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "acceln.h" +#include "ray.h" +#include "../../include/embree3/rtcore_ray.h" +#include "../../common/algorithms/parallel_for.h" + +namespace embree +{ + AccelN::AccelN() + : Accel(AccelData::TY_ACCELN), accels() {} + + AccelN::~AccelN() + { + for (size_t i=0; iptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + changed |= This->accels[i]->intersectors.pointQuery(query,context); + return changed; + } + + void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect(ray,context); + } + + void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect4(valid,ray,context); + } + + void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect8(valid,ray,context); + } + + void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersect16(valid,ray,context); + } + + 
void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.intersectN(ray,N,context); + } + + void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded(ray,context); + if (ray.tfar < 0.0f) break; + } + } + + void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded4(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + if (unlikely(none(valid0 & hit0))) break; +#endif + } + } + + void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; iaccels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded8(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; +#endif + } + } + + void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + for (size_t i=0; 
iaccels.size(); i++) { + if (This->accels[i]->isEmpty()) continue; + This->accels[i]->intersectors.occluded16(valid,ray,context); +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA + vbool4 valid0 = asBool(((vint4*)valid)[0]); + vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); + vbool4 valid1 = asBool(((vint4*)valid)[1]); + vbool4 hit1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero); + vbool4 valid2 = asBool(((vint4*)valid)[2]); + vbool4 hit2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero); + vbool4 valid3 = asBool(((vint4*)valid)[3]); + vbool4 hit3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero); + if (unlikely((none((valid0 & hit0) | (valid1 & hit1) | (valid2 & hit2) | (valid3 & hit3))))) break; +#endif + } + } + + void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context) + { + AccelN* This = (AccelN*)This_in->ptr; + size_t M = N; + for (size_t i=0; iaccels.size(); i++) + if (!This->accels[i]->isEmpty()) + This->accels[i]->intersectors.occludedN(ray,M,context); + } + + void AccelN::accels_print(size_t ident) + { + for (size_t i=0; iintersectors.print(ident+2); + } + } + + void AccelN::accels_immutable() + { + for (size_t i=0; iimmutable(); + } + + void AccelN::accels_build () + { + /* reduce memory consumption */ + accels.shrink_to_fit(); + + /* build all acceleration structures in parallel */ + parallel_for (accels.size(), [&] (size_t i) { + accels[i]->build(); + }); + + /* create list of non-empty acceleration structures */ + bool valid1 = true; + bool valid4 = true; + bool valid8 = true; + bool valid16 = true; + for (size_t i=0; iintersectors.intersector1; + valid4 &= (bool) accels[i]->intersectors.intersector4; + valid8 &= (bool) accels[i]->intersectors.intersector8; + valid16 &= (bool) accels[i]->intersectors.intersector16; + } + + if (accels.size() == 1) { + type = accels[0]->type; // FIXME: should just assign entire Accel + bounds = accels[0]->bounds; + intersectors = 
accels[0]->intersectors; + } + else + { + type = AccelData::TY_ACCELN; + intersectors.ptr = this; + intersectors.intersector1 = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr); + intersectors.intersector4 = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr); + intersectors.intersector8 = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr); + intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr); + intersectors.intersectorN = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN"); + + /*! calculate bounds */ + bounds = empty; + for (size_t i=0; ibounds); + } + } + + void AccelN::accels_select(bool filter) + { + for (size_t i=0; iintersectors.select(filter); + } + + void AccelN::accels_deleteGeometry(size_t geomID) + { + for (size_t i=0; ideleteGeometry(geomID); + } + + void AccelN::accels_clear() + { + for (size_t i=0; iclear(); + } + } +} + diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h new file mode 100644 index 00000000000..2edd98f647d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/acceln.h @@ -0,0 +1,49 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accel.h" + +namespace embree +{ + /*! 
merges N acceleration structures together, by processing them in order */ + class AccelN : public Accel + { + public: + AccelN (); + ~AccelN(); + + public: + void accels_add(Accel* accel); + void accels_init(); + + public: + static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + + public: + static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context); + static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context); + static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context); + static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context); + static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context); + + public: + static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context); + static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context); + static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context); + static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context); + static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context); + + public: + void accels_print(size_t ident); + void accels_immutable(); + void accels_build (); + void accels_select(bool filter); + void accels_deleteGeometry(size_t geomID); + void accels_clear (); + + public: + std::vector accels; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp new file mode 100644 index 00000000000..79be1c4301b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp @@ -0,0 +1,17 @@ +// Copyright 2009-2020 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "accelset.h" +#include "scene.h" + +namespace embree +{ + AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps) + : Geometry(device,gtype,(unsigned int)numItems,(unsigned int)numTimeSteps), boundsFunc(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (ErrorFunc error) + : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {} + + AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name) + : intersect(intersect), occluded(occluded), name(name) {} +} diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h new file mode 100644 index 00000000000..3774b2accb0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/accelset.h @@ -0,0 +1,248 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "builder.h" +#include "geometry.h" +#include "ray.h" +#include "hit.h" + +namespace embree +{ + struct IntersectFunctionNArguments; + struct OccludedFunctionNArguments; + + typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); + + struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportIntersectionFunc report; + }; + + struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments + { + IntersectContext* internal_context; + Geometry* geometry; + ReportOcclusionFunc report; + }; + + /*! Base class for set of acceleration structures. 
*/ + class AccelSet : public Geometry + { + public: + typedef RTCIntersectFunctionN IntersectFuncN; + typedef RTCOccludedFunctionN OccludedFuncN; + typedef void (*ErrorFunc) (); + + struct IntersectorN + { + IntersectorN (ErrorFunc error = nullptr) ; + IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name); + + operator bool() const { return name; } + + public: + static const char* type; + IntersectFuncN intersect; + OccludedFuncN occluded; + const char* name; + }; + + public: + + /*! construction */ + AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps); + + /*! makes the acceleration structure immutable */ + virtual void immutable () {} + + /*! build accel */ + virtual void build () = 0; + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid_non_empty(bounds(i,itime))) return false; + + return true; + } + + /*! Calculates the bounds of an item */ + __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const + { + BBox3fa box; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)itime; + args.bounds_o = (RTCBounds*)&box; + boundsFunc(&args); + return box; + } + + /*! calculates the linear bounds of the i'th item at the itime'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const + { + BBox3fa box[2]; + assert(i < size()); + RTCBoundsFunctionArguments args; + args.geometryUserPtr = userPtr; + args.primID = (unsigned int)i; + args.timeStep = (unsigned int)(itime+0); + args.bounds_o = (RTCBounds*)&box[0]; + boundsFunc(&args); + args.timeStep = (unsigned int)(itime+1); + args.bounds_o = (RTCBounds*)&box[1]; + boundsFunc(&args); + return LBBox3fa(box[0],box[1]); + } + + /*! 
calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid_non_empty(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds0; // use bounding box of first timestep to build BVH + return isvalid_non_empty(bounds); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + public: + + /*! Intersects a single ray with the scene. 
*/ + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + int mask = -1; + IntersectFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if single ray is occluded by the scene. */ + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + int mask = -1; + OccludedFunctionNArguments args; + args.valid = &mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = 1; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + /*! Intersects a packet of K rays with the scene. */ + template + __forceinline void intersect (const vbool& valid, RayHitK& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + { + assert(primID < size()); + assert(intersectorN.intersect); + + vint mask = valid.mask32(); + IntersectFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.rayhit = (RTCRayHitN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.intersect(&args); + } + + /*! Tests if a packet of K rays is occluded by the scene. 
*/ + template + __forceinline void occluded (const vbool& valid, RayK& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + { + assert(primID < size()); + assert(intersectorN.occluded); + + vint mask = valid.mask32(); + OccludedFunctionNArguments args; + args.valid = (int*)&mask; + args.geometryUserPtr = userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.N = K; + args.geomID = geomID; + args.primID = primID; + args.internal_context = context; + args.geometry = this; + args.report = report; + + intersectorN.occluded(&args); + } + + public: + RTCBoundsFunction boundsFunc; + IntersectorN intersectorN; + }; + +#define DEFINE_SET_INTERSECTORN(symbol,intersector) \ + AccelSet::IntersectorN symbol() { \ + return AccelSet::IntersectorN(intersector::intersect, \ + intersector::occluded, \ + TOSTRING(isa) "::" TOSTRING(symbol)); \ + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp new file mode 100644 index 00000000000..6fa406f03a6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp @@ -0,0 +1,82 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "alloc.h" +#include "../../common/sys/thread.h" +#if defined(__aarch64__) && defined(BUILD_IOS) +#include "../../common/sys/barrier.h" +#endif + +namespace embree +{ + __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; + SpinLock FastAllocator::s_thread_local_allocators_lock; + std::vector> FastAllocator::s_thread_local_allocators; + + struct fast_allocator_regression_test : public RegressionTest + { + BarrierSys barrier; + std::atomic numFailed; + std::unique_ptr alloc; + + fast_allocator_regression_test() + : RegressionTest("fast_allocator_regression_test"), numFailed(0) + { + registerRegressionTest(this); + } + + static void thread_alloc(fast_allocator_regression_test* This) + { + 
FastAllocator::CachedAllocator threadalloc = This->alloc->getCachedAllocator(); + + size_t* ptrs[1000]; + for (size_t j=0; j<1000; j++) + { + This->barrier.wait(); + for (size_t i=0; i<1000; i++) { + ptrs[i] = (size_t*) threadalloc.malloc0(sizeof(size_t)+(i%32)); + *ptrs[i] = size_t(threadalloc.talloc0) + i; + } + for (size_t i=0; i<1000; i++) { + if (*ptrs[i] != size_t(threadalloc.talloc0) + i) + This->numFailed++; + } + This->barrier.wait(); + } + } + + bool run () + { + alloc = make_unique(new FastAllocator(nullptr,false)); + numFailed.store(0); + + size_t numThreads = getNumberOfLogicalThreads(); + barrier.init(numThreads+1); + + /* create threads */ + std::vector threads; + for (size_t i=0; ireset(); + barrier.wait(); + barrier.wait(); + } + + /* destroy threads */ + for (size_t i=0; i +#endif + +namespace embree +{ + class FastAllocator + { + /*! maximum supported alignment */ + static const size_t maxAlignment = 64; + + /*! maximum allocation size */ + + /* default settings */ + //static const size_t defaultBlockSize = 4096; +#define maxAllocationSize size_t(2*1024*1024-maxAlignment) + + static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; + + public: + + struct ThreadLocal2; + enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; + + /*! Per thread structure holding the current memory block. */ + struct __aligned(64) ThreadLocal + { + ALIGNED_CLASS_(64); + public: + + /*! Constructor for usage with ThreadLocalData */ + __forceinline ThreadLocal (ThreadLocal2* parent) + : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {} + + /*! initialize allocator */ + void init(FastAllocator* alloc) + { + ptr = nullptr; + cur = end = 0; + bytesUsed = 0; + bytesWasted = 0; + allocBlockSize = 0; + if (alloc) allocBlockSize = alloc->defaultBlockSize; + } + + /* Allocate aligned memory from the threads memory block. 
*/ + __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) + { + /* bind the thread local allocator to the proper FastAllocator*/ + parent->bind(alloc); + + assert(align <= maxAlignment); + bytesUsed += bytes; + + /* try to allocate in local block */ + size_t ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* if allocation is too large allocate with parent allocator */ + if (4*bytes > allocBlockSize) { + return alloc->malloc(bytes,maxAlignment,false); + } + + /* get new partial block if allocation failed */ + size_t blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* get new full block if allocation failed */ + blockSize = allocBlockSize; + ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); + bytesWasted += end-cur; + cur = 0; end = blockSize; + + /* retry allocation */ + ofs = (align - cur) & (align-1); + cur += bytes + ofs; + if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; } + cur -= bytes + ofs; + + /* should never happen as large allocations get handled specially above */ + assert(false); + return nullptr; + } + + + /*! returns amount of used bytes */ + __forceinline size_t getUsedBytes() const { return bytesUsed; } + + /*! returns amount of free bytes */ + __forceinline size_t getFreeBytes() const { return end-cur; } + + /*! 
returns amount of wasted bytes */ + __forceinline size_t getWastedBytes() const { return bytesWasted; } + + private: + ThreadLocal2* parent; + char* ptr; //!< pointer to memory block + size_t cur; //!< current location of the allocator + size_t end; //!< end of the memory block + size_t allocBlockSize; //!< block size for allocations + size_t bytesUsed; //!< number of total bytes allocated + size_t bytesWasted; //!< number of bytes wasted + }; + + /*! Two thread local structures. */ + struct __aligned(64) ThreadLocal2 + { + ALIGNED_CLASS_(64); + public: + + __forceinline ThreadLocal2() + : alloc(nullptr), alloc0(this), alloc1(this) {} + + /*! bind to fast allocator */ + __forceinline void bind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() == alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock lock(mutex); +#endif + //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind + if (alloc.load()) { + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + } + alloc0.init(alloc_i); + alloc1.init(alloc_i); + alloc.store(alloc_i); + alloc_i->join(this); + } + + /*! 
unbind to fast allocator */ + void unbind(FastAllocator* alloc_i) + { + assert(alloc_i); + if (alloc.load() != alloc_i) return; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock lock(mutex); +#endif + if (alloc.load() != alloc_i) return; // required as a different thread calls unbind + alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); + alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); + alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes(); + alloc0.init(nullptr); + alloc1.init(nullptr); + alloc.store(nullptr); + } + + public: +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex mutex; +#else + SpinLock mutex; //!< required as unbind is called from other threads +#endif + std::atomic alloc; //!< parent allocator + ThreadLocal alloc0; + ThreadLocal alloc1; + }; + + FastAllocator (Device* device, bool osAllocation) + : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC), + primrefarray(device,0) + { + for (size_t i=0; i& primrefarray_i) { + primrefarray = std::move(primrefarray_i); + } + + void unshare(mvector& primrefarray_o) + { + reset(); // this removes blocks that are allocated inside the shared primref array + primrefarray_o = std::move(primrefarray); + } + + /*! returns first fast thread local allocator */ + __forceinline ThreadLocal* _threadLocal() { + return &threadLocal2()->alloc0; + } + + void setOSallocation(bool flag) + { + atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; + } + + private: + + /*! 
returns both fast thread local allocators */ + __forceinline ThreadLocal2* threadLocal2() + { + ThreadLocal2* alloc = thread_local_allocator2; + if (alloc == nullptr) { + thread_local_allocator2 = alloc = new ThreadLocal2; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else + Lock lock(s_thread_local_allocators_lock); +#endif + s_thread_local_allocators.push_back(make_unique(alloc)); + } + return alloc; + } + + public: + + __forceinline void join(ThreadLocal2* alloc) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else + Lock lock(thread_local_allocators_lock); +#endif + thread_local_allocators.push_back(alloc); + } + + public: + + struct CachedAllocator + { + __forceinline CachedAllocator(void* ptr) + : alloc(nullptr), talloc0(nullptr), talloc1(nullptr) + { + assert(ptr == nullptr); + } + + __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc) + : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {} + + __forceinline operator bool () const { + return alloc != nullptr; + } + + __forceinline void* operator() (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { + return talloc0->malloc(alloc,bytes,align); + } + + __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { + return talloc1->malloc(alloc,bytes,align); + } + + public: + FastAllocator* alloc; + ThreadLocal* talloc0; + ThreadLocal* talloc1; + }; + + __forceinline CachedAllocator getCachedAllocator() { + return CachedAllocator(this,threadLocal2()); + } + + /*! 
Builder interface to create thread local allocator */ + struct Create + { + public: + __forceinline Create (FastAllocator* allocator) : allocator(allocator) {} + __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); } + + private: + FastAllocator* allocator; + }; + + void internal_fix_used_blocks() + { + /* move thread local blocks to global block list */ + for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++) + { + while (threadBlocks[i].load() != nullptr) { + Block* nextUsedBlock = threadBlocks[i].load()->next; + threadBlocks[i].load()->next = usedBlocks.load(); + usedBlocks = threadBlocks[i].load(); + threadBlocks[i] = nextUsedBlock; + } + threadBlocks[i] = nullptr; + } + } + + static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks +#if defined(__AVX512ER__) // KNL + static const size_t mainAllocOverheadStatic = 15; //! 15 means 7.5% allocation overhead through unfilled main alloc blocks +#else + static const size_t mainAllocOverheadStatic = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks +#endif + static const size_t mainAllocOverheadDynamic = 8; //! 20 means 12.5% allocation overhead through unfilled main alloc blocks + + /* calculates a single threaded threshold for the builders such + * that for small scenes the overhead of partly allocated blocks + * per thread is low */ + size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated) + { + if (numPrimitives == 0 || bytesEstimated == 0) + return defaultThreshold; + + /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */ + const size_t single_mode_factor = use_single_mode ? 
1 : 2; + const size_t threadCount = TaskScheduler::threadCount(); + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize; + + /* if we do not have to limit number of threads use optimal threshold */ + if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + return defaultThreshold; + + /* otherwise limit number of threads by calculating proper single thread threshold */ + else { + double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives); + return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive)); + } + } + + __forceinline size_t alignSize(size_t i) { + return (i+127)/128*128; + } + + /*! initializes the grow size */ + __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) + { + /* we do not need single thread local allocator mode */ + use_single_mode = false; + + /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */ + size_t mainAllocOverhead = fast ? 
mainAllocOverheadDynamic : mainAllocOverheadStatic; + size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead); + growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize); + + /* if we reached the maxAllocationSize for growSize, we can + * increase the number of allocation slots by still guaranteeing + * the mainAllocationOverhead */ + slotMask = 0x0; + + if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7; + if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */ + + /* set the thread local alloc block size */ + size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment; + + /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */ +#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL + const size_t threadCount = TaskScheduler::threadCount(); + const size_t single_mode_factor = use_single_mode ? 
1 : 2; + const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch; + if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount) + defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize); + + /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */ + else +#endif + defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch); + + if (bytesEstimated == 0) { + maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size + defaultBlockSize = defaultBlockSizeSwitch; + } + log2_grow_size_scale = 0; + + if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size; + if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0; + if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1; + if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3; + if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7; + if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size; + if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc; + } + + /*! initializes the allocator */ + void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate) + { + internal_fix_used_blocks(); + /* distribute the allocation to multiple thread block slots */ + slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + if (bytesReserve == 0) bytesReserve = bytesAllocate; + freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype); + estimatedSize = bytesEstimate; + initGrowSizeAndNumSlots(bytesEstimate,true); + } + + /*! initializes the allocator */ + void init_estimate(size_t bytesEstimate) + { + internal_fix_used_blocks(); + if (usedBlocks.load() || freeBlocks.load()) { reset(); return; } + /* single allocator mode ? 
*/ + estimatedSize = bytesEstimate; + //initGrowSizeAndNumSlots(bytesEstimate,false); + initGrowSizeAndNumSlots(bytesEstimate,false); + + } + + /*! frees state not required after build */ + __forceinline void cleanup() + { + internal_fix_used_blocks(); + + /* unbind all thread local allocators */ + for (auto alloc : thread_local_allocators) alloc->unbind(this); + thread_local_allocators.clear(); + } + + /*! resets the allocator, memory blocks get reused */ + void reset () + { + internal_fix_used_blocks(); + + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + + /* reset all used blocks and move them to begin of free block list */ + while (usedBlocks.load() != nullptr) { + usedBlocks.load()->reset_block(); + Block* nextUsedBlock = usedBlocks.load()->next; + usedBlocks.load()->next = freeBlocks.load(); + freeBlocks = usedBlocks.load(); + usedBlocks = nextUsedBlock; + } + + /* remove all shared blocks as they are re-added during build */ + freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load())); + + for (size_t i=0; iunbind(this); + thread_local_allocators.clear(); + } + + /*! 
frees all allocated memory */ + __forceinline void clear() + { + cleanup(); + bytesUsed.store(0); + bytesFree.store(0); + bytesWasted.store(0); + if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr; + if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr; + for (size_t i=0; imalloc(device,bytes,align,partial); + if (ptr) return ptr; + } + + /* throw error if allocation is too large */ + if (bytes > maxAllocationSize) + throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large"); + + /* parallel block creation in case of no freeBlocks, avoids single global mutex */ + if (likely(freeBlocks.load() == nullptr)) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(slotMutex[slot]); +#else + Lock lock(slotMutex[slot]); +#endif + if (myUsedBlocks == threadUsedBlocks[slot]) { + const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); + const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); + assert(allocSize >= bytes); + threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here! + // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail. 
+ } + continue; + } + + /* if this fails allocate new block */ + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock lock(mutex); +#endif + if (myUsedBlocks == threadUsedBlocks[slot]) + { + if (freeBlocks.load() != nullptr) { + Block* nextFreeBlock = freeBlocks.load()->next; + freeBlocks.load()->next = usedBlocks; + __memory_barrier(); + usedBlocks = freeBlocks.load(); + threadUsedBlocks[slot] = freeBlocks.load(); + freeBlocks = nextFreeBlock; + } else { + const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize); + usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above! + } + } + } + } + } + + /*! add new block */ + void addBlock(void* ptr, ssize_t bytes) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(mutex); +#else + Lock lock(mutex); +#endif + const size_t sizeof_Header = offsetof(Block,data[0]); + void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); + size_t ofs = (size_t) aptr - (size_t) ptr; + bytes -= ofs; + if (bytes < 4096) return; // ignore empty or very small blocks + freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs); + } + + /* special allocation only used from morton builder only a single time for each build */ + void* specialAlloc(size_t bytes) + { + assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes); + return freeBlocks.load()->ptr(); + } + + struct Statistics + { + Statistics () + : bytesUsed(0), bytesFree(0), bytesWasted(0) {} + + Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted) + : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {} + + Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false) + : bytesUsed(0), bytesFree(0), bytesWasted(0) + { + Block* usedBlocks = 
alloc->usedBlocks.load(); + Block* freeBlocks = alloc->freeBlocks.load(); + if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages); + if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); + if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages); + if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages); + if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages); + } + + std::string str(size_t numPrimitives) + { + std::stringstream str; + str.setf(std::ios::fixed, std::ios::floatfield); + str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives); + return str.str(); + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b) + { + return Statistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted); + } + + size_t bytesAllocatedTotal() const { + return bytesUsed + bytesFree + bytesWasted; + } + + public: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + }; + + Statistics getStatistics(AllocationType atype, bool huge_pages = false) { + return Statistics(this,atype,huge_pages); + } + + size_t getUsedBytes() { + return bytesUsed; + } + + size_t getWastedBytes() { + return bytesWasted; + } + + struct AllStatistics + { + AllStatistics (FastAllocator* alloc) + + : bytesUsed(alloc->bytesUsed), + bytesFree(alloc->bytesFree), + bytesWasted(alloc->bytesWasted), + stat_all(alloc,ANY_TYPE), + stat_malloc(alloc,ALIGNED_MALLOC), + stat_4K(alloc,EMBREE_OS_MALLOC,false), + stat_2M(alloc,EMBREE_OS_MALLOC,true), + 
stat_shared(alloc,SHARED) {} + + AllStatistics (size_t bytesUsed, + size_t bytesFree, + size_t bytesWasted, + Statistics stat_all, + Statistics stat_malloc, + Statistics stat_4K, + Statistics stat_2M, + Statistics stat_shared) + + : bytesUsed(bytesUsed), + bytesFree(bytesFree), + bytesWasted(bytesWasted), + stat_all(stat_all), + stat_malloc(stat_malloc), + stat_4K(stat_4K), + stat_2M(stat_2M), + stat_shared(stat_shared) {} + + friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b) + { + return AllStatistics(a.bytesUsed+b.bytesUsed, + a.bytesFree+b.bytesFree, + a.bytesWasted+b.bytesWasted, + a.stat_all + b.stat_all, + a.stat_malloc + b.stat_malloc, + a.stat_4K + b.stat_4K, + a.stat_2M + b.stat_2M, + a.stat_shared + b.stat_shared); + } + + void print(size_t numPrimitives) + { + std::stringstream str0; + str0.setf(std::ios::fixed, std::ios::floatfield); + str0 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << " " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives); + std::cout << str0.str() << std::endl; + + std::stringstream str1; + str1.setf(std::ios::fixed, std::ios::floatfield); + str1 << " alloc : " + << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, " + << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, " + << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, " + << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, " + << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives); + std::cout << str1.str() << std::endl; + + std::cout << " total : " << stat_all.str(numPrimitives) << std::endl; + std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl; + std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl; 
+ std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl; + std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl; + } + + private: + size_t bytesUsed; + size_t bytesFree; + size_t bytesWasted; + Statistics stat_all; + Statistics stat_malloc; + Statistics stat_4K; + Statistics stat_2M; + Statistics stat_shared; + }; + + void print_blocks() + { + std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl; + + std::cout << " used blocks = "; + if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + + std::cout << " free blocks = "; + if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list(); + std::cout << "[END]" << std::endl; + } + + private: + + struct Block + { + static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype) + { + /* We avoid using os_malloc for small blocks as this could + * cause a risk of fragmenting the virtual address space and + * reach the limit of vm.max_map_count = 65k under Linux. 
*/ + if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) + atype = ALIGNED_MALLOC; + + /* we need to additionally allocate some header */ + const size_t sizeof_Header = offsetof(Block,data[0]); + bytesAllocate = sizeof_Header+bytesAllocate; + bytesReserve = sizeof_Header+bytesReserve; + + /* consume full 4k pages with using os_malloc */ + if (atype == EMBREE_OS_MALLOC) { + bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); + } + + /* either use alignedMalloc or os_malloc */ + void *ptr = nullptr; + if (atype == ALIGNED_MALLOC) + { + /* special handling for default block size */ + if (bytesAllocate == (2*PAGE_SIZE_2M)) + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + + /* give hint to transparently convert these pages to 2MB pages */ + const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1); + os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block + os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M); + os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block + + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + else + { + const size_t alignment = maxAlignment; + if (device) device->memoryMonitor(bytesAllocate+alignment,false); + ptr = alignedMalloc(bytesAllocate,alignment); + return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); + } + } + else if (atype == EMBREE_OS_MALLOC) + { + if (device) device->memoryMonitor(bytesAllocate,false); + bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); + return new (ptr) 
Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + } + else + assert(false); + + return NULL; + } + + Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false) + : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages) + { + assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); + } + + static Block* remove_shared_blocks(Block* head) + { + Block** prev_next = &head; + for (Block* block = head; block; block = block->next) { + if (block->atype == SHARED) *prev_next = block->next; + else prev_next = &block->next; + } + return head; + } + + void clear_list(MemoryMonitorInterface* device) + { + Block* block = this; + while (block) { + Block* next = block->next; + block->clear_block(device); + block = next; + } + } + + void clear_block (MemoryMonitorInterface* device) + { + const size_t sizeof_Header = offsetof(Block,data[0]); + const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes(); + + if (atype == ALIGNED_MALLOC) { + alignedFree(this); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else if (atype == EMBREE_OS_MALLOC) { + size_t sizeof_This = sizeof_Header+reserveEnd; + os_free(this,sizeof_This,huge_pages); + if (device) device->memoryMonitor(-sizeof_Alloced,true); + } + + else /* if (atype == SHARED) */ { + } + } + + void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial) + { + size_t bytes = bytes_in; + assert(align <= maxAlignment); + bytes = (bytes+(align-1)) & ~(align-1); + if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; + const size_t i = cur.fetch_add(bytes); + if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; + if (unlikely(i > reserveEnd)) return nullptr; + bytes_in = bytes = min(bytes,reserveEnd-i); + + if (i+bytes > allocEnd) { + if (device) 
device->memoryMonitor(i+bytes-max(i,allocEnd),true); + } + return &data[i]; + } + + void* ptr() { + return &data[cur]; + } + + void reset_block () + { + allocEnd = max(allocEnd,(size_t)cur); + cur = 0; + } + + size_t getBlockUsedBytes() const { + return min(size_t(cur),reserveEnd); + } + + size_t getBlockFreeBytes() const { + return getBlockAllocatedBytes() - getBlockUsedBytes(); + } + + size_t getBlockAllocatedBytes() const { + return min(max(allocEnd,size_t(cur)),reserveEnd); + } + + size_t getBlockWastedBytes() const { + const size_t sizeof_Header = offsetof(Block,data[0]); + return sizeof_Header + wasted; + } + + size_t getBlockReservedBytes() const { + return reserveEnd; + } + + bool hasType(AllocationType atype_i, bool huge_pages_i) const + { + if (atype_i == ANY_TYPE ) return true; + else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else return atype_i == atype; + } + + size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockUsedBytes(); + } + return bytes; + } + + size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockFreeBytes(); + } + return bytes; + } + + size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += block->getBlockWastedBytes(); + } + return bytes; + } + + size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { + size_t bytes = 0; + for (const Block* block = this; block; block = block->next) { + if (!block->hasType(atype,huge_pages)) continue; + bytes += 
block->getBlockAllocatedBytes(); + } + return bytes; + } + + void print_list () + { + for (const Block* block = this; block; block = block->next) + block->print_block(); + } + + void print_block() const + { + if (atype == ALIGNED_MALLOC) std::cout << "A"; + else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; + else if (atype == SHARED) std::cout << "S"; + if (huge_pages) std::cout << "H"; + size_t bytesUsed = getBlockUsedBytes(); + size_t bytesFree = getBlockFreeBytes(); + size_t bytesWasted = getBlockWastedBytes(); + std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] "; + } + + public: + std::atomic cur; //!< current location of the allocator + std::atomic allocEnd; //!< end of the allocated memory region + std::atomic reserveEnd; //!< end of the reserved memory region + Block* next; //!< pointer to next block in list + size_t wasted; //!< amount of memory wasted through block alignment + AllocationType atype; //!< allocation mode of the block + bool huge_pages; //!< whether the block uses huge pages + char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment + char data[1]; //!< here starts memory to use for allocations + }; + + private: + Device* device; + SpinLock mutex; + size_t slotMask; + std::atomic threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; + std::atomic usedBlocks; + std::atomic freeBlocks; + + std::atomic threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#else + SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#endif + + bool use_single_mode; + size_t defaultBlockSize; + size_t estimatedSize; + size_t growSize; + size_t maxGrowSize; + std::atomic log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove + std::atomic bytesUsed; + std::atomic bytesFree; + std::atomic bytesWasted; + static __thread ThreadLocal2* thread_local_allocator2; + static 
SpinLock s_thread_local_allocators_lock; + static std::vector> s_thread_local_allocators; +#if defined(__aarch64__) && defined(BUILD_IOS) + std::mutex thread_local_allocators_lock; +#else + SpinLock thread_local_allocators_lock; +#endif + std::vector thread_local_allocators; + AllocationType atype; + mvector primrefarray; //!< primrefarray used to allocate nodes + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h new file mode 100644 index 00000000000..02d319c59dd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/buffer.h @@ -0,0 +1,263 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" + +namespace embree +{ + /*! Implements an API data buffer object. This class may or may not own the data. */ + class Buffer : public RefCount + { + public: + /*! Buffer construction */ + Buffer() + : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {} + + /*! Buffer construction */ + Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr) + : device(device), numBytes(numBytes_in) + { + device->refInc(); + + if (ptr_in) + { + shared = true; + ptr = (char*)ptr_in; + } + else + { + shared = false; + alloc(); + } + } + + /*! Buffer destruction */ + ~Buffer() { + free(); + device->refDec(); + } + + /*! this class is not copyable */ + private: + Buffer(const Buffer& other) DELETED; // do not implement + Buffer& operator =(const Buffer& other) DELETED; // do not implement + + public: + /* inits and allocates the buffer */ + void create(Device* device_in, size_t numBytes_in) + { + init(device_in, numBytes_in); + alloc(); + } + + /* inits the buffer */ + void init(Device* device_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = nullptr; + numBytes = numBytes_in; + shared = false; + } + + /*! 
sets shared buffer */ + void set(Device* device_in, void* ptr_in, size_t numBytes_in) + { + free(); + device = device_in; + ptr = (char*)ptr_in; + if (numBytes_in != (size_t)-1) + numBytes = numBytes_in; + shared = true; + } + + /*! allocated buffer */ + void alloc() + { + if (device) + device->memoryMonitor(this->bytes(), false); + size_t b = (this->bytes()+15) & ssize_t(-16); + ptr = (char*)alignedMalloc(b,16); + } + + /*! frees the buffer */ + void free() + { + if (shared) return; + alignedFree(ptr); + if (device) + device->memoryMonitor(-ssize_t(this->bytes()), true); + ptr = nullptr; + } + + /*! gets buffer pointer */ + void* data() + { + /* report error if buffer is not existing */ + if (!device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified"); + + /* return buffer */ + return ptr; + } + + /*! returns pointer to first element */ + __forceinline char* getPtr() const { + return ptr; + } + + /*! returns the number of bytes of the buffer */ + __forceinline size_t bytes() const { + return numBytes; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr; + } + + public: + Device* device; //!< device to report memory usage to + char* ptr; //!< pointer to buffer data + size_t numBytes; //!< number of bytes in the buffer + bool shared; //!< set if memory is shared with application + }; + + /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */ + class RawBufferView + { + public: + /*! Buffer construction */ + RawBufferView() + : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {} + + public: + /*! 
sets the buffer view */ + void set(const Ref& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in) + { + if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds"); + + ptr_ofs = buffer_in->ptr + offset_in; + stride = stride_in; + num = num_in; + format = format_in; + modCounter++; + modified = true; + buffer = buffer_in; + } + + /*! returns pointer to the first element */ + __forceinline char* getPtr() const { + return ptr_ofs; + } + + /*! returns pointer to the i'th element */ + __forceinline char* getPtr(size_t i) const + { + assert(i otherModCounter; + } + + /*! mark buffer as modified or unmodified */ + __forceinline bool isLocalModified() const { + return modified; + } + + /*! clear local modified flag */ + __forceinline void clearLocalModified() { + modified = false; + } + + /*! returns true of the buffer is not empty */ + __forceinline operator bool() const { + return ptr_ofs; + } + + /*! checks padding to 16 byte check, fails hard */ + __forceinline void checkPadding16() const + { + if (ptr_ofs && num) + volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable? + } + + public: + char* ptr_ofs; //!< base pointer plus offset + size_t stride; //!< stride of the buffer in bytes + size_t num; //!< number of elements in the buffer + RTCFormat format; //!< format of the buffer + unsigned int modCounter; //!< version ID of this buffer + bool modified; //!< local modified data + int userData; //!< special data + Ref buffer; //!< reference to the parent buffer + }; + + /*! A typed contiguous range of a buffer. This class does not own the buffer content. */ + template + class BufferView : public RawBufferView + { + public: + typedef T value_type; + + /*! 
access to the ith element of the buffer */ + __forceinline T& operator [](size_t i) { assert(i + class BufferView : public RawBufferView + { + public: + typedef Vec3fa value_type; + + /*! access to the ith element of the buffer */ + __forceinline const Vec3fa operator [](size_t i) const + { + assert(i + struct ProgressMonitorClosure : BuildProgressMonitor + { + public: + ProgressMonitorClosure (const Closure& closure) : closure(closure) {} + void operator() (size_t dn) const { closure(dn); } + private: + const Closure closure; + }; + template __forceinline const ProgressMonitorClosure BuildProgressMonitorFromClosure(const Closure& closure) { + return ProgressMonitorClosure(closure); + } + + struct LineSegments; + struct TriangleMesh; + struct QuadMesh; + struct UserGeometry; + + class Scene; + + typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder); + typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + +} diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h new file mode 100644 index 00000000000..d0185a74f2e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/context.h @@ -0,0 +1,131 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "rtcore.h" +#include "point_query.h" + +namespace embree +{ + class Scene; + + struct IntersectContext + { + public: + __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) + : scene(scene), user(user_context) {} + + __forceinline bool hasContextFilter() const { + return user->filter != nullptr; + } + + 
__forceinline bool isCoherent() const { + return embree::isCoherent(user->flags); + } + + __forceinline bool isIncoherent() const { + return embree::isIncoherent(user->flags); + } + + public: + Scene* scene; + RTCIntersectContext* user; + }; + + template + __forceinline Vec4vf enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf& ray_org, const Vec4vf& v) + { +#if RTC_MIN_WIDTH + const vfloat d = length(Vec3vf(v) - ray_org); + const vfloat r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec4vf(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + template + __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v) + { +#if RTC_MIN_WIDTH + const float d = length(Vec3fa(v) - ray_org); + const float r = clamp(context->user->minWidthDistanceFactor*d, v.w, geom->maxRadiusScale*v.w); + return Vec3ff(v.x,v.y,v.z,r); +#else + return v; +#endif + } + + enum PointQueryType + { + POINT_QUERY_TYPE_UNDEFINED = 0, + POINT_QUERY_TYPE_SPHERE = 1, + POINT_QUERY_TYPE_AABB = 2, + }; + + typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args); + + struct PointQueryContext + { + public: + __forceinline PointQueryContext(Scene* scene, + PointQuery* query_ws, + PointQueryType query_type, + PointQueryFunction func, + RTCPointQueryContext* userContext, + float similarityScale, + void* userPtr) + : scene(scene) + , query_ws(query_ws) + , query_type(query_type) + , func(func) + , userContext(userContext) + , similarityScale(similarityScale) + , userPtr(userPtr) + , primID(RTC_INVALID_GEOMETRY_ID) + , geomID(RTC_INVALID_GEOMETRY_ID) + , query_radius(query_ws->radius) + { + if (query_type == POINT_QUERY_TYPE_AABB) { + assert(similarityScale == 0.f); + updateAABB(); + } + if (userContext->instStackSize == 0) { + assert(similarityScale == 1.f); + } + } + + public: + __forceinline void updateAABB() + { + if 
(likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) { + query_radius = Vec3fa(query_ws->radius); + return; + } + + const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius)); + bbox = xfmBounds(m, bbox); + query_radius = 0.5f * (bbox.upper - bbox.lower); + } + +public: + Scene* scene; + + PointQuery* query_ws; // the original world space point query + PointQueryType query_type; + PointQueryFunction func; + RTCPointQueryContext* userContext; + const float similarityScale; + + void* userPtr; + + unsigned int primID; + unsigned int geomID; + + Vec3fa query_radius; // used if the query is converted to an AABB internally + }; +} + diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h new file mode 100644 index 00000000000..709119163b8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/default.h @@ -0,0 +1,273 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" +#include "../../common/sys/thread.h" +#include "../../common/sys/alloc.h" +#include "../../common/sys/ref.h" +#include "../../common/sys/intrinsics.h" +#include "../../common/sys/atomic.h" +#include "../../common/sys/mutex.h" +#include "../../common/sys/vector.h" +#include "../../common/sys/array.h" +#include "../../common/sys/string.h" +#include "../../common/sys/regression.h" +#include "../../common/sys/vector.h" + +#include "../../common/math/math.h" +#include "../../common/math/transcendental.h" +#include "../../common/simd/simd.h" +#include "../../common/math/vec2.h" +#include "../../common/math/vec3.h" +#include "../../common/math/vec4.h" +#include "../../common/math/vec2fa.h" +#include "../../common/math/vec3fa.h" +#include 
"../../common/math/interval.h" +#include "../../common/math/bbox.h" +#include "../../common/math/obbox.h" +#include "../../common/math/lbbox.h" +#include "../../common/math/linearspace2.h" +#include "../../common/math/linearspace3.h" +#include "../../common/math/affinespace.h" +#include "../../common/math/range.h" +#include "../../common/lexers/tokenstream.h" + +#include "../../common/tasking/taskscheduler.h" + +#define COMMA , + +#include "../config.h" +#include "isa.h" +#include "stat.h" +#include "profile.h" +#include "rtcore.h" +#include "vector.h" +#include "state.h" +#include "instance_stack.h" + +#include +#include +#include +#include +#include +#include + +#if !defined(_DEBUG) && defined(BUILD_IOS) +#undef assert +#define assert(_EXPR) +#endif + +namespace embree +{ + //////////////////////////////////////////////////////////////////////////////// + /// Vec2 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template using Vec2vf = Vec2>; + template using Vec2vd = Vec2>; + template using Vec2vr = Vec2>; + template using Vec2vi = Vec2>; + template using Vec2vl = Vec2>; + template using Vec2vb = Vec2>; + template using Vec2vbf = Vec2>; + template using Vec2vbd = Vec2>; + + typedef Vec2 Vec2vf4; + typedef Vec2 Vec2vd4; + typedef Vec2 Vec2vr4; + typedef Vec2 Vec2vi4; + typedef Vec2 Vec2vl4; + typedef Vec2 Vec2vb4; + typedef Vec2 Vec2vbf4; + typedef Vec2 Vec2vbd4; + + typedef Vec2 Vec2vf8; + typedef Vec2 Vec2vd8; + typedef Vec2 Vec2vr8; + typedef Vec2 Vec2vi8; + typedef Vec2 Vec2vl8; + typedef Vec2 Vec2vb8; + typedef Vec2 Vec2vbf8; + typedef Vec2 Vec2vbd8; + + typedef Vec2 Vec2vf16; + typedef Vec2 Vec2vd16; + typedef Vec2 Vec2vr16; + typedef Vec2 Vec2vi16; + typedef Vec2 Vec2vl16; + typedef Vec2 Vec2vb16; + typedef Vec2 Vec2vbf16; + typedef Vec2 Vec2vbd16; + + typedef Vec2 Vec2vfx; + typedef Vec2 Vec2vdx; + typedef Vec2 Vec2vrx; + typedef Vec2 Vec2vix; + typedef Vec2 Vec2vlx; + typedef Vec2 Vec2vbx; + typedef Vec2 
Vec2vbfx; + typedef Vec2 Vec2vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec3 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template using Vec3vf = Vec3>; + template using Vec3vd = Vec3>; + template using Vec3vr = Vec3>; + template using Vec3vi = Vec3>; + template using Vec3vl = Vec3>; + template using Vec3vb = Vec3>; + template using Vec3vbf = Vec3>; + template using Vec3vbd = Vec3>; + + typedef Vec3 Vec3vf4; + typedef Vec3 Vec3vd4; + typedef Vec3 Vec3vr4; + typedef Vec3 Vec3vi4; + typedef Vec3 Vec3vl4; + typedef Vec3 Vec3vb4; + typedef Vec3 Vec3vbf4; + typedef Vec3 Vec3vbd4; + + typedef Vec3 Vec3vf8; + typedef Vec3 Vec3vd8; + typedef Vec3 Vec3vr8; + typedef Vec3 Vec3vi8; + typedef Vec3 Vec3vl8; + typedef Vec3 Vec3vb8; + typedef Vec3 Vec3vbf8; + typedef Vec3 Vec3vbd8; + + typedef Vec3 Vec3vf16; + typedef Vec3 Vec3vd16; + typedef Vec3 Vec3vr16; + typedef Vec3 Vec3vi16; + typedef Vec3 Vec3vl16; + typedef Vec3 Vec3vb16; + typedef Vec3 Vec3vbf16; + typedef Vec3 Vec3vbd16; + + typedef Vec3 Vec3vfx; + typedef Vec3 Vec3vdx; + typedef Vec3 Vec3vrx; + typedef Vec3 Vec3vix; + typedef Vec3 Vec3vlx; + typedef Vec3 Vec3vbx; + typedef Vec3 Vec3vbfx; + typedef Vec3 Vec3vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Vec4 shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template using Vec4vf = Vec4>; + template using Vec4vd = Vec4>; + template using Vec4vr = Vec4>; + template using Vec4vi = Vec4>; + template using Vec4vl = Vec4>; + template using Vec4vb = Vec4>; + template using Vec4vbf = Vec4>; + template using Vec4vbd = Vec4>; + + typedef Vec4 Vec4vf4; + typedef Vec4 Vec4vd4; + typedef Vec4 Vec4vr4; + typedef Vec4 Vec4vi4; + typedef Vec4 Vec4vl4; + typedef Vec4 Vec4vb4; + typedef Vec4 Vec4vbf4; + typedef Vec4 Vec4vbd4; + + typedef Vec4 Vec4vf8; + typedef Vec4 Vec4vd8; + typedef 
Vec4 Vec4vr8; + typedef Vec4 Vec4vi8; + typedef Vec4 Vec4vl8; + typedef Vec4 Vec4vb8; + typedef Vec4 Vec4vbf8; + typedef Vec4 Vec4vbd8; + + typedef Vec4 Vec4vf16; + typedef Vec4 Vec4vd16; + typedef Vec4 Vec4vr16; + typedef Vec4 Vec4vi16; + typedef Vec4 Vec4vl16; + typedef Vec4 Vec4vb16; + typedef Vec4 Vec4vbf16; + typedef Vec4 Vec4vbd16; + + typedef Vec4 Vec4vfx; + typedef Vec4 Vec4vdx; + typedef Vec4 Vec4vrx; + typedef Vec4 Vec4vix; + typedef Vec4 Vec4vlx; + typedef Vec4 Vec4vbx; + typedef Vec4 Vec4vbfx; + typedef Vec4 Vec4vbdx; + + //////////////////////////////////////////////////////////////////////////////// + /// Other shortcuts + //////////////////////////////////////////////////////////////////////////////// + + template using BBox3vf = BBox>; + typedef BBox BBox3vf4; + typedef BBox BBox3vf8; + typedef BBox BBox3vf16; + + /* calculate time segment itime and fractional time ftime */ + __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime) + { + const float timeScaled = time * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime) + { + const float timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const float itimef = clamp(floorf(timeScaled), 0.0f, numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return int(itimef); + } + + template + __forceinline vint getTimeSegment(const vfloat& time, const vfloat& numTimeSegments, vfloat& ftime) + { + const vfloat timeScaled = time * numTimeSegments; + const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint(itimef); + } + + template + __forceinline vint getTimeSegment(const vfloat& time, const vfloat& start_time, const vfloat& end_time, const vfloat& numTimeSegments, vfloat& 
ftime) + { + const vfloat timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; + const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); + ftime = timeScaled - itimef; + return vint(itimef); + } + + /* calculate overlapping time segment range */ + __forceinline range getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments) + { + const float round_up = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step + const float round_down = 1.0f-2.0f*float(ulp); + const int itime_lower = (int)max(floor(round_up *time_range.lower*numTimeSegments), 0.0f); + const int itime_upper = (int)min(ceil (round_down*time_range.upper*numTimeSegments), numTimeSegments); + return make_range(itime_lower, itime_upper); + } + + /* calculate overlapping time segment range */ + __forceinline range getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments) + { + const float lower = (range.lower-time_range.lower)/time_range.size(); + const float upper = (range.upper-time_range.lower)/time_range.size(); + return getTimeSegmentRange(BBox1f(lower,upper),numTimeSegments); + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp new file mode 100644 index 00000000000..16ec11b8929 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/device.cpp @@ -0,0 +1,567 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "device.h" +#include "../hash.h" +#include "scene_triangle_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_subdiv_mesh.h" + +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +#include "../geometry/cylinder.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" + +#include "../../common/tasking/taskscheduler.h" +#include "../../common/sys/alloc.h" + 
+namespace embree +{ + /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */ + ssize_t Device::debug_int0 = 0; + ssize_t Device::debug_int1 = 0; + ssize_t Device::debug_int2 = 0; + ssize_t Device::debug_int3 = 0; + + DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs); + + static MutexSys g_mutex; + static std::map g_cache_size_map; + static std::map g_num_threads_map; + + Device::Device (const char* cfg) + { + /* check that CPU supports lowest ISA */ + if (!hasISA(ISA)) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR); + } + + /* set default frequency level for detected CPU */ + switch (getCPUModel()) { + case CPU::UNKNOWN: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_ICE_LAKE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_KABY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE_SKY_LAKE: frequency_level = FREQUENCY_SIMD128; break; + case CPU::XEON_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_BROADWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_HASWELL: frequency_level = FREQUENCY_SIMD256; break; + case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::SANDY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break; + case CPU::NEHALEM: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE2: frequency_level = FREQUENCY_SIMD128; break; + case CPU::CORE1: frequency_level = 
FREQUENCY_SIMD128; break; + } + + /* initialize global state */ +#if defined(EMBREE_CONFIG) + State::parseString(EMBREE_CONFIG); +#endif + State::parseString(cfg); + if (!ignore_config_files && FileName::executableFolder() != FileName("")) + State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); + if (!ignore_config_files && FileName::homeFolder() != FileName("")) + State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR))); + State::verify(); + + /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */ + if (!checkISASupport()) { + throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA"); + } + + /*! do some internal tests */ + assert(isa::Cylinder::verify()); + + /*! enable huge page support if desired */ +#if defined(__WIN32__) + if (State::enable_selockmemoryprivilege) + State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3)); +#endif + State::hugepages_success &= os_init(State::hugepages,State::verbosity(3)); + + /*! set tessellation cache size */ + setCacheSize( State::tessellation_cache_size ); + + /*! 
enable some floating point exceptions to catch bugs */ + if (State::float_exceptions) + { + int exceptions = _MM_MASK_MASK; + //exceptions &= ~_MM_MASK_INVALID; + exceptions &= ~_MM_MASK_DENORM; + exceptions &= ~_MM_MASK_DIV_ZERO; + //exceptions &= ~_MM_MASK_OVERFLOW; + //exceptions &= ~_MM_MASK_UNDERFLOW; + //exceptions &= ~_MM_MASK_INEXACT; + _MM_SET_EXCEPTION_MASK(exceptions); + } + + /* print info header */ + if (State::verbosity(1)) + print(); + if (State::verbosity(2)) + State::print(); + + /* register all algorithms */ + bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features)); + +#if defined(EMBREE_TARGET_SIMD8) + bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features)); +#endif + + /* setup tasking system */ + initTaskingSystem(numThreads); + + /* ray stream SOA to AOS conversion */ +#if defined(EMBREE_RAY_PACKETS) + RayStreamFilterFuncsType rayStreamFilterFuncs; + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs); + rayStreamFilters = rayStreamFilterFuncs(); +#endif + } + + Device::~Device () + { + setCacheSize(0); + exitTaskingSystem(); + } + + std::string getEnabledTargets() + { + std::string v; +#if defined(EMBREE_TARGET_SSE2) + v += "SSE2 "; +#endif +#if defined(EMBREE_TARGET_SSE42) + v += "SSE4.2 "; +#endif +#if defined(EMBREE_TARGET_AVX) + v += "AVX "; +#endif +#if defined(EMBREE_TARGET_AVX2) + v += "AVX2 "; +#endif +#if defined(EMBREE_TARGET_AVX512KNL) + v += "AVX512KNL "; +#endif +#if defined(EMBREE_TARGET_AVX512SKX) + v += "AVX512SKX "; +#endif + return v; + } + + std::string getEmbreeFeatures() + { + std::string v; +#if defined(EMBREE_RAY_MASK) + v += "raymasks "; +#endif +#if defined (EMBREE_BACKFACE_CULLING) + v += "backfaceculling "; +#endif +#if defined (EMBREE_BACKFACE_CULLING_CURVES) + v += "backfacecullingcurves "; +#endif +#if defined(EMBREE_FILTER_FUNCTION) + v += "intersection_filter "; +#endif +#if defined 
(EMBREE_COMPACT_POLYS) + v += "compact_polys "; +#endif + return v; + } + + void Device::print() + { + const int cpu_features = getCPUFeatures(); + std::cout << std::endl; + std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl; + std::cout << " Compiler : " << getCompilerName() << std::endl; + std::cout << " Build : "; +#if defined(DEBUG) + std::cout << "Debug " << std::endl; +#else + std::cout << "Release " << std::endl; +#endif + std::cout << " Platform : " << getPlatformName() << std::endl; + std::cout << " CPU : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl; + std::cout << " Threads : " << getNumberOfLogicalThreads() << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(cpu_features) << std::endl; + const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; + const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; + std::cout << " MXCSR : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl; + std::cout << " Config" << std::endl; + std::cout << " Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl; + std::cout << " ISA : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl; + std::cout << " Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl; + std::cout << " " << getEnabledTargets() << " (compile time enabled)" << std::endl; + std::cout << " Features: " << getEmbreeFeatures() << std::endl; + std::cout << " Tasking : "; +#if defined(TASKING_TBB) + std::cout << "TBB" << TBB_VERSION_MAJOR << "." 
<< TBB_VERSION_MINOR << " "; + #if TBB_INTERFACE_VERSION >= 12002 + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " "; + #else + std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " "; + #endif +#endif +#if defined(TASKING_INTERNAL) + std::cout << "internal_tasking_system "; +#endif +#if defined(TASKING_GCD) && defined(BUILD_IOS) + std::cout << "GCD tasking system "; +#endif +#if defined(TASKING_PPL) + std::cout << "PPL "; +#endif + std::cout << std::endl; + + /* check of FTZ and DAZ flags are set in CSR */ + if (!hasFTZ || !hasDAZ) + { +#if !defined(_DEBUG) + if (State::verbosity(1)) +#endif + { + std::cout << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << " WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled " << std::endl + << " in the MXCSR control and status register. This can have a severe " << std::endl + << " performance impact. 
Please enable these modes for each application " << std::endl + << " thread the following way:" << std::endl + << std::endl + << " #include \"xmmintrin.h\"" << std::endl + << " #include \"pmmintrin.h\"" << std::endl + << std::endl + << " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl + << " _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl; + std::cout << "================================================================================" << std::endl; + std::cout << std::endl; + } + } + std::cout << std::endl; + } + + void Device::setDeviceErrorCode(RTCError error) + { + RTCError* stored_error = errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getDeviceErrorCode() + { + RTCError* stored_error = errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::setThreadErrorCode(RTCError error) + { + RTCError* stored_error = g_errorHandler.error(); + if (*stored_error == RTC_ERROR_NONE) + *stored_error = error; + } + + RTCError Device::getThreadErrorCode() + { + RTCError* stored_error = g_errorHandler.error(); + RTCError error = *stored_error; + *stored_error = RTC_ERROR_NONE; + return error; + } + + void Device::process_error(Device* device, RTCError error, const char* str) + { + /* store global error code when device construction failed */ + if (!device) + return setThreadErrorCode(error); + + /* print error when in verbose mode */ + if (device->verbosity(1)) + { + switch (error) { + case RTC_ERROR_NONE : std::cerr << "Embree: No error"; break; + case RTC_ERROR_UNKNOWN : std::cerr << "Embree: Unknown error"; break; + case RTC_ERROR_INVALID_ARGUMENT : std::cerr << "Embree: Invalid argument"; break; + case RTC_ERROR_INVALID_OPERATION: std::cerr << "Embree: Invalid operation"; break; + case RTC_ERROR_OUT_OF_MEMORY : std::cerr << "Embree: Out of memory"; break; + case RTC_ERROR_UNSUPPORTED_CPU : std::cerr << "Embree: 
Unsupported CPU"; break; + default : std::cerr << "Embree: Invalid error code"; break; + }; + if (str) std::cerr << ", (" << str << ")"; + std::cerr << std::endl; + } + + /* call user specified error callback */ + if (device->error_function) + device->error_function(device->error_function_userptr,error,str); + + /* record error code */ + device->setDeviceErrorCode(error); + } + + void Device::memoryMonitor(ssize_t bytes, bool post) + { + if (State::memory_monitor_function && bytes != 0) { + if (!State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) { + if (bytes > 0) { // only throw exception when we allocate memory to never throw inside a destructor + throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination"); + } + } + } + } + + size_t getMaxNumThreads() + { + size_t maxNumThreads = 0; + for (std::map::iterator i=g_num_threads_map.begin(); i != g_num_threads_map.end(); i++) + maxNumThreads = max(maxNumThreads, (*i).second); + if (maxNumThreads == 0) + maxNumThreads = std::numeric_limits::max(); + return maxNumThreads; + } + + size_t getMaxCacheSize() + { + size_t maxCacheSize = 0; + for (std::map::iterator i=g_cache_size_map.begin(); i!= g_cache_size_map.end(); i++) + maxCacheSize = max(maxCacheSize, (*i).second); + return maxCacheSize; + } + + void Device::setCacheSize(size_t bytes) + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + Lock lock(g_mutex); + if (bytes == 0) g_cache_size_map.erase(this); + else g_cache_size_map[this] = bytes; + + size_t maxCacheSize = getMaxCacheSize(); + resizeTessellationCache(maxCacheSize); +#endif + } + + void Device::initTaskingSystem(size_t numThreads) + { + Lock lock(g_mutex); + if (numThreads == 0) + g_num_threads_map[this] = std::numeric_limits::max(); + else + g_num_threads_map[this] = numThreads; + + /* create task scheduler */ + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); +#if USE_TASK_ARENA + const size_t 
nThreads = min(maxNumThreads,TaskScheduler::threadCount()); + const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads); + arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads)); +#endif + } + + void Device::exitTaskingSystem() + { + Lock lock(g_mutex); + g_num_threads_map.erase(this); + + /* terminate tasking system */ + if (g_num_threads_map.size() == 0) { + TaskScheduler::destroy(); + } + /* or configure new number of threads */ + else { + size_t maxNumThreads = getMaxNumThreads(); + TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads); + } +#if USE_TASK_ARENA + arena.reset(); +#endif + } + + void Device::setProperty(const RTCDeviceProperty prop, ssize_t val) + { + /* hidden internal properties */ + switch ((size_t)prop) + { + case 1000000: debug_int0 = val; return; + case 1000001: debug_int1 = val; return; + case 1000002: debug_int2 = val; return; + case 1000003: debug_int3 = val; return; + } + + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); + } + + ssize_t Device::getProperty(const RTCDeviceProperty prop) + { + size_t iprop = (size_t)prop; + + /* get name of internal regression test */ + if (iprop >= 2000000 && iprop < 3000000) + { + RegressionTest* test = getRegressionTest(iprop-2000000); + if (test) return (ssize_t) test->name.c_str(); + else return 0; + } + + /* run internal regression test */ + if (iprop >= 3000000 && iprop < 4000000) + { + RegressionTest* test = getRegressionTest(iprop-3000000); + if (test) return test->run(); + else return 0; + } + + /* documented properties */ + switch (prop) + { + case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR; + case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR; + case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH; + case RTC_DEVICE_PROPERTY_VERSION : return RTC_VERSION; + +#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: 
return hasISA(SSE2); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return hasISA(AVX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX); +#else + case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_PACKETS) + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_RAY_MASK) + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0; +#endif + +#if defined(EMBREE_BACKFACE_CULLING_CURVES) + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0; +#endif + +#if defined(EMBREE_COMPACT_POLYS) + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0; +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1; +#else + case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0; +#endif + +#if defined(TASKING_INTERNAL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; +#endif + +#if defined(TASKING_TBB) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; +#endif + +#if 
defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; +#endif + +#if defined(TASKING_GCD) && defined(BUILD_IOS) + case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_USER) + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0; +#endif + +#if defined(TASKING_PPL) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; +#else + case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1; +#endif + +#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1; +#else + case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0; +#endif + + default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/device.h 
b/thirdparty/embree-aarch64/kernels/common/device.h new file mode 100644 index 00000000000..e9a81bb1091 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/device.h @@ -0,0 +1,85 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "state.h" +#include "accel.h" + +namespace embree +{ + class BVH4Factory; + class BVH8Factory; + + class Device : public State, public MemoryMonitorInterface + { + ALIGNED_CLASS_(16); + + public: + + /*! Device construction */ + Device (const char* cfg); + + /*! Device destruction */ + virtual ~Device (); + + /*! prints info about the device */ + void print(); + + /*! sets the error code */ + void setDeviceErrorCode(RTCError error); + + /*! returns and clears the error code */ + RTCError getDeviceErrorCode(); + + /*! sets the error code */ + static void setThreadErrorCode(RTCError error); + + /*! returns and clears the error code */ + static RTCError getThreadErrorCode(); + + /*! processes error codes, do not call directly */ + static void process_error(Device* device, RTCError error, const char* str); + + /*! invokes the memory monitor callback */ + void memoryMonitor(ssize_t bytes, bool post); + + /*! sets the size of the software cache. */ + void setCacheSize(size_t bytes); + + /*! sets a property */ + void setProperty(const RTCDeviceProperty prop, ssize_t val); + + /*! gets a property */ + ssize_t getProperty(const RTCDeviceProperty prop); + + private: + + /*! initializes the tasking system */ + void initTaskingSystem(size_t numThreads); + + /*! shuts down the tasking system */ + void exitTaskingSystem(); + + /*! 
some variables that can be set via rtcSetParameter1i for debugging purposes */ + public: + static ssize_t debug_int0; + static ssize_t debug_int1; + static ssize_t debug_int2; + static ssize_t debug_int3; + + public: + std::unique_ptr bvh4_factory; +#if defined(EMBREE_TARGET_SIMD8) + std::unique_ptr bvh8_factory; +#endif + +#if USE_TASK_ARENA + std::unique_ptr arena; +#endif + + /* ray streams filter */ + RayStreamFilterFuncs rayStreamFilters; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp new file mode 100644 index 00000000000..b3aa8e33960 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp @@ -0,0 +1,259 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "geometry.h" +#include "scene.h" + +namespace embree +{ + const char* Geometry::gtype_names[Geometry::GTY_END] = + { + "flat_linear_curve", + "round_linear_curve", + "oriented_linear_curve", + "", + "flat_bezier_curve", + "round_bezier_curve", + "oriented_bezier_curve", + "", + "flat_bspline_curve", + "round_bspline_curve", + "oriented_bspline_curve", + "", + "flat_hermite_curve", + "round_hermite_curve", + "oriented_hermite_curve", + "", + "flat_catmull_rom_curve", + "round_catmull_rom_curve", + "oriented_catmull_rom_curve", + "", + "triangles", + "quads", + "grid", + "subdivs", + "", + "sphere", + "disc", + "oriented_disc", + "", + "usergeom", + "instance_cheap", + "instance_expensive", + }; + + Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) + : device(device), userPtr(nullptr), + numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), + mask(-1), + gtype(gtype), + gsubtype(GTY_SUBTYPE_DEFAULT), + quality(RTC_BUILD_QUALITY_MEDIUM), + state((unsigned)State::MODIFIED), + enabled(true), + intersectionFilterN(nullptr), 
occlusionFilterN(nullptr), pointQueryFunc(nullptr) + { + device->refInc(); + } + + Geometry::~Geometry() + { + device->refDec(); + } + + void Geometry::setNumPrimitives(unsigned int numPrimitives_in) + { + if (numPrimitives_in == numPrimitives) return; + + numPrimitives = numPrimitives_in; + + Geometry::update(); + } + + void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in) + { + if (numTimeSteps_in == numTimeSteps) { + return; + } + + numTimeSteps = numTimeSteps_in; + fnumTimeSegments = float(numTimeSteps_in-1); + + Geometry::update(); + } + + void Geometry::setTimeRange (const BBox1f range) + { + time_range = range; + Geometry::update(); + } + + void Geometry::update() + { + ++modCounter_; // FIXME: required? + state = (unsigned)State::MODIFIED; + } + + void Geometry::commit() + { + ++modCounter_; + state = (unsigned)State::COMMITTED; + } + + void Geometry::preCommit() + { + if (State::MODIFIED == (State)state) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed"); + } + + void Geometry::postCommit() + { + } + + void Geometry::enable () + { + if (isEnabled()) + return; + + enabled = true; + ++modCounter_; + } + + void Geometry::disable () + { + if (isDisabled()) + return; + + enabled = false; + ++modCounter_; + } + + void Geometry::setUserData (void* ptr) + { + userPtr = ptr; + } + + void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + intersectionFilterN = filter; + } + + void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) + { + if (!(getTypeMask() & (MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH))) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); + + 
occlusionFilterN = filter; + } + + void Geometry::setPointQueryFunction (RTCPointQueryFunction func) + { + pointQueryFunc = func; + } + + void Geometry::interpolateN(const RTCInterpolateNArguments* const args) + { + const void* valid_i = args->valid; + const unsigned* primIDs = args->primIDs; + const float* u = args->u; + const float* v = args->v; + unsigned int N = args->N; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); + const int* valid = (const int*) valid_i; + + __aligned(64) float P_tmp[256]; + __aligned(64) float dPdu_tmp[256]; + __aligned(64) float dPdv_tmp[256]; + __aligned(64) float ddPdudu_tmp[256]; + __aligned(64) float ddPdvdv_tmp[256]; + __aligned(64) float ddPdudv_tmp[256]; + + float* Pt = P ? 
P_tmp : nullptr; + float* dPdut = nullptr, *dPdvt = nullptr; + if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } + float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr; + if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } + + for (unsigned int i=0; iprimID < size()); + + RTCPointQueryFunctionArguments args; + args.query = (RTCPointQuery*)context->query_ws; + args.userPtr = context->userPtr; + args.primID = context->primID; + args.geomID = context->geomID; + args.context = context->userContext; + args.similarityScale = context->similarityScale; + + bool update = false; + if(context->func) update |= context->func(&args); + if(pointQueryFunc) update |= pointQueryFunc(&args); + + if (update && context->userContext->instStackSize > 0) + { + // update point query + if (context->query_type == POINT_QUERY_TYPE_AABB) { + context->updateAABB(); + } else { + assert(context->similarityScale > 0.f); + query->radius = context->query_ws->radius * context->similarityScale; + } + } + return update; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h new file mode 100644 index 00000000000..953974bfd29 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/geometry.h @@ -0,0 +1,582 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "buffer.h" +#include "../common/point_query.h" +#include "../builders/priminfo.h" + +namespace embree +{ + class Scene; + class Geometry; + + struct GeometryCounts + { + __forceinline GeometryCounts() + : numFilterFunctions(0), + numTriangles(0), numMBTriangles(0), + numQuads(0), numMBQuads(0), + numBezierCurves(0), numMBBezierCurves(0), + numLineSegments(0), numMBLineSegments(0), + numSubdivPatches(0), numMBSubdivPatches(0), + numUserGeometries(0), numMBUserGeometries(0), + numInstancesCheap(0), numMBInstancesCheap(0), + 
numInstancesExpensive(0), numMBInstancesExpensive(0), + numGrids(0), numMBGrids(0), + numPoints(0), numMBPoints(0) {} + + __forceinline size_t size() const { + return numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints + + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints; + } + + __forceinline unsigned int enabledGeometryTypesMask() const + { + unsigned int mask = 0; + if (numTriangles) mask |= 1 << 0; + if (numQuads) mask |= 1 << 1; + if (numBezierCurves+numLineSegments) mask |= 1 << 2; + if (numSubdivPatches) mask |= 1 << 3; + if (numUserGeometries) mask |= 1 << 4; + if (numInstancesCheap) mask |= 1 << 5; + if (numInstancesExpensive) mask |= 1 << 6; + if (numGrids) mask |= 1 << 7; + if (numPoints) mask |= 1 << 8; + + unsigned int maskMB = 0; + if (numMBTriangles) maskMB |= 1 << 0; + if (numMBQuads) maskMB |= 1 << 1; + if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2; + if (numMBSubdivPatches) maskMB |= 1 << 3; + if (numMBUserGeometries) maskMB |= 1 << 4; + if (numMBInstancesCheap) maskMB |= 1 << 5; + if (numMBInstancesExpensive) maskMB |= 1 << 6; + if (numMBGrids) maskMB |= 1 << 7; + if (numMBPoints) maskMB |= 1 << 8; + + return (mask<<8) + maskMB; + } + + __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const + { + GeometryCounts ret; + ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions; + ret.numTriangles = numTriangles + rhs.numTriangles; + ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles; + ret.numQuads = numQuads + rhs.numQuads; + ret.numMBQuads = numMBQuads + rhs.numMBQuads; + ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves; + ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves; + ret.numLineSegments = numLineSegments + 
rhs.numLineSegments; + ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments; + ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches; + ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches; + ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries; + ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries; + ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap; + ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap; + ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive; + ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive; + ret.numGrids = numGrids + rhs.numGrids; + ret.numMBGrids = numMBGrids + rhs.numMBGrids; + ret.numPoints = numPoints + rhs.numPoints; + ret.numMBPoints = numMBPoints + rhs.numMBPoints; + + return ret; + } + + size_t numFilterFunctions; //!< number of geometries with filter functions enabled + size_t numTriangles; //!< number of enabled triangles + size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numQuads; //!< number of enabled quads + size_t numMBQuads; //!< number of enabled motion blurred quads + size_t numBezierCurves; //!< number of enabled curves + size_t numMBBezierCurves; //!< number of enabled motion blurred curves + size_t numLineSegments; //!< number of enabled line segments + size_t numMBLineSegments; //!< number of enabled line motion blurred segments + size_t numSubdivPatches; //!< number of enabled subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numUserGeometries; //!< number of enabled user geometries + size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries + size_t numInstancesCheap; //!< number of enabled cheap instances + size_t numMBInstancesCheap; //!< number of enabled motion blurred cheap instances + size_t numInstancesExpensive; //!< number of 
enabled expensive instances + size_t numMBInstancesExpensive; //!< number of enabled motion blurred expensive instances + size_t numGrids; //!< number of enabled grid geometries + size_t numMBGrids; //!< number of enabled motion blurred grid geometries + size_t numPoints; //!< number of enabled points + size_t numMBPoints; //!< number of enabled motion blurred points + }; + + /*! Base class all geometries are derived from */ + class Geometry : public RefCount + { + friend class Scene; + public: + + /*! type of geometry */ + enum GType + { + GTY_FLAT_LINEAR_CURVE = 0, + GTY_ROUND_LINEAR_CURVE = 1, + GTY_ORIENTED_LINEAR_CURVE = 2, + GTY_CONE_LINEAR_CURVE = 3, + + GTY_FLAT_BEZIER_CURVE = 4, + GTY_ROUND_BEZIER_CURVE = 5, + GTY_ORIENTED_BEZIER_CURVE = 6, + + GTY_FLAT_BSPLINE_CURVE = 8, + GTY_ROUND_BSPLINE_CURVE = 9, + GTY_ORIENTED_BSPLINE_CURVE = 10, + + GTY_FLAT_HERMITE_CURVE = 12, + GTY_ROUND_HERMITE_CURVE = 13, + GTY_ORIENTED_HERMITE_CURVE = 14, + + GTY_FLAT_CATMULL_ROM_CURVE = 16, + GTY_ROUND_CATMULL_ROM_CURVE = 17, + GTY_ORIENTED_CATMULL_ROM_CURVE = 18, + + GTY_TRIANGLE_MESH = 20, + GTY_QUAD_MESH = 21, + GTY_GRID_MESH = 22, + GTY_SUBDIV_MESH = 23, + + GTY_SPHERE_POINT = 25, + GTY_DISC_POINT = 26, + GTY_ORIENTED_DISC_POINT = 27, + + GTY_USER_GEOMETRY = 29, + GTY_INSTANCE_CHEAP = 30, + GTY_INSTANCE_EXPENSIVE = 31, + GTY_END = 32, + + GTY_BASIS_LINEAR = 0, + GTY_BASIS_BEZIER = 4, + GTY_BASIS_BSPLINE = 8, + GTY_BASIS_HERMITE = 12, + GTY_BASIS_CATMULL_ROM = 16, + GTY_BASIS_MASK = 28, + + GTY_SUBTYPE_FLAT_CURVE = 0, + GTY_SUBTYPE_ROUND_CURVE = 1, + GTY_SUBTYPE_ORIENTED_CURVE = 2, + GTY_SUBTYPE_MASK = 3, + }; + + enum GSubType + { + GTY_SUBTYPE_DEFAULT= 0, + GTY_SUBTYPE_INSTANCE_LINEAR = 0, + GTY_SUBTYPE_INSTANCE_QUATERNION = 1 + }; + + enum GTypeMask + { + MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE, + MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE, + MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE, + MTY_ORIENTED_LINEAR_CURVE = 1ul << 
GTY_ORIENTED_LINEAR_CURVE, + + MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE, + MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE, + MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE, + + MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE, + MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE, + MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE, + + MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE, + MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE, + MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE, + + MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE, + MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE, + MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE, + + MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE | + MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE | + MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE | + MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE, + + MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT, + MTY_DISC_POINT = 1ul << GTY_DISC_POINT, + MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT, + + MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT, + + MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS, + + MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH, + MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH, + MTY_GRID_MESH = 1ul << GTY_GRID_MESH, + MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH, + MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY, + + MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP, + MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE, + MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE + }; + + static const char* gtype_names[GTY_END]; + + enum class 
State : unsigned { + MODIFIED = 0, + COMMITTED = 1, + }; + + public: + + /*! Geometry constructor */ + Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps); + + /*! Geometry destructor */ + virtual ~Geometry(); + + public: + + /*! tests if geometry is enabled */ + __forceinline bool isEnabled() const { return enabled; } + + /*! tests if geometry is disabled */ + __forceinline bool isDisabled() const { return !isEnabled(); } + + /*! tests if that geometry has some filter function set */ + __forceinline bool hasFilterFunctions () const { + return (intersectionFilterN != nullptr) || (occlusionFilterN != nullptr); + } + + /*! returns geometry type */ + __forceinline GType getType() const { return gtype; } + + /*! returns curve type */ + __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); } + + /*! returns curve basis */ + __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); } + + /*! returns geometry type mask */ + __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1 << gtype); } + + /*! returns number of primitives */ + __forceinline size_t size() const { return numPrimitives; } + + /*! sets the number of primitives */ + virtual void setNumPrimitives(unsigned int numPrimitives_in); + + /*! sets number of time steps */ + virtual void setNumTimeSteps (unsigned int numTimeSteps_in); + + /*! sets motion blur time range */ + void setTimeRange (const BBox1f range); + + /*! sets number of vertex attributes */ + virtual void setVertexAttributeCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! sets number of topologies */ + virtual void setTopologyCount (unsigned int N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! 
sets the build quality */ + void setBuildQuality(RTCBuildQuality quality_in) + { + this->quality = quality_in; + Geometry::update(); + } + + /* calculate time segment itime and fractional time ftime */ + __forceinline int timeSegment(float time, float& ftime) const { + return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime); + } + + template + __forceinline vint timeSegment(const vfloat& time, vfloat& ftime) const { + return getTimeSegment(time,vfloat(time_range.lower),vfloat(time_range.upper),vfloat(fnumTimeSegments),ftime); + } + + /* calculate overlapping time segment range */ + __forceinline range timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,fnumTimeSegments); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<(int)numTimeSteps); + return time_range.lower + time_range.size()*float(i)/fnumTimeSegments; + } + + /*! for all geometries */ + public: + + /*! Enable geometry. */ + virtual void enable(); + + /*! Update geometry. */ + void update(); + + /*! commit of geometry */ + virtual void commit(); + + /*! Update geometry buffer. */ + virtual void updateBuffer(RTCBufferType type, unsigned int slot) { + update(); // update everything for geometries not supporting this call + } + + /*! Disable geometry. */ + virtual void disable(); + + /*! Verify the geometry */ + virtual bool verify() { return true; } + + /*! called before every build */ + virtual void preCommit(); + + /*! called after every build */ + virtual void postCommit(); + + virtual void addElementsToCount (GeometryCounts & counts) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + }; + + /*! sets constant tessellation rate for the geometry */ + virtual void setTessellationRate(float N) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! 
Sets the maximal curve radius scale allowed by min-width feature. */ + virtual void setMaxRadiusScale(float s) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set user data pointer. */ + virtual void setUserData(void* ptr); + + /*! Get user data pointer. */ + __forceinline void* getUserData() const { + return userPtr; + } + + /*! interpolates user data to the specified u/v location */ + virtual void interpolate(const RTCInterpolateArguments* const args) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! interpolates user data to the specified u/v locations */ + virtual void interpolateN(const RTCInterpolateNArguments* const args); + + /* point query api */ + bool pointQuery(PointQuery* query, PointQueryContext* context); + + /*! for subdivision surfaces only */ + public: + virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set displacement function. 
*/ + virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFirstHalfEdge(unsigned int faceID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getFace(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getNextHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! get fast access to first vertex buffer if applicable */ + virtual float * getCompactVertexArray () const { + return nullptr; + } + + /*! Returns the modified counter - how many times the geo has been modified */ + __forceinline unsigned int getModCounter () const { + return modCounter_; + } + + /*! for triangle meshes and bezier curves only */ + public: + + + /*! Sets ray mask. */ + virtual void setMask(unsigned mask) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets specified buffer. */ + virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Gets specified buffer. 
*/ + virtual void* getBuffer(RTCBufferType type, unsigned int slot) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersection filter function for ray packets of size N. */ + virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! Set occlusion filter function for ray packets of size N. */ + virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN); + + /*! for instances only */ + public: + + /*! Sets the instanced scene */ + virtual void setInstancedScene(const Ref& scene) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Sets transformation of the instance */ + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Returns the transformation of the instance */ + virtual AffineSpace3fa getTransform(float time) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! for user geometries only */ + public: + + /*! Set bounds function. */ + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set intersect function for ray packets of size N. */ + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set occlusion function for ray packets of size N. 
*/ + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); + } + + /*! Set point query function. */ + void setPointQueryFunction(RTCPointQueryFunction func); + + /*! returns number of time segments */ + __forceinline unsigned numTimeSegments () const { + return numTimeSteps-1; + } + + public: + + virtual PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); + } + + virtual PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual PrimInfoMB createPrimRefMBArray(mvector& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual Vec3fa computeDirection(unsigned int primID, size_t time) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); + } + + virtual BBox3fa vbounds(size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual 
BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); + } + + public: + __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; } + __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; } + + public: + Device* device; //!< device this geometry belongs to + + void* userPtr; //!< user pointer + unsigned int numPrimitives; //!< number of primitives of this geometry + + unsigned int numTimeSteps; //!< number of time steps + float fnumTimeSegments; //!< number of time segments (precalculation) + BBox1f time_range; //!< motion blur time range + + unsigned int mask; //!< for masking out geometry + unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified + + struct { + GType gtype : 8; //!< geometry type + GSubType gsubtype : 8; //!< geometry subtype + RTCBuildQuality quality : 3; //!< build quality for geometry + unsigned state : 2; + bool enabled : 
1; //!< true if geometry is enabled + }; + + RTCFilterFunctionN intersectionFilterN; + RTCFilterFunctionN occlusionFilterN; + RTCPointQueryFunction pointQueryFunc; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h new file mode 100644 index 00000000000..32a198cdfe4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/hit.h @@ -0,0 +1,114 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "ray.h" +#include "instance_stack.h" + +namespace embree +{ + /* Hit structure for K hits */ + template + struct HitK + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, const vuint& geomID, const vuint& primID, const vfloat& u, const vfloat& v, const Vec3vf& Ng) + : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + instance_id_stack::copy(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline size_t size() { return K; } + + public: + Vec3vf Ng; // geometry normal + vfloat u; // barycentric u coordinate of hit + vfloat v; // barycentric v coordinate of hit + vuint primID; // primitive ID + vuint geomID; // geometry ID + vuint instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single hit */ + template<> + struct __aligned(16) HitK<1> + { + /* Default construction does nothing */ + __forceinline HitK() {} + + /* Constructs a hit */ + __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng) + : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID) + { + instance_id_stack::copy(context->instID, instID); + } + + /* Returns the size of the hit */ + static __forceinline 
size_t size() { return 1; } + + public: + Vec3 Ng; // geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Shortcuts */ + typedef HitK<1> Hit; + typedef HitK<4> Hit4; + typedef HitK<8> Hit8; + typedef HitK<16> Hit16; + + /* Outputs hit to stream */ + template + __forceinline embree_ostream operator<<(embree_ostream cout, const HitK& ray) + { + cout << "{ " << embree_endl + << " Ng = " << ray.Ng << embree_endl + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + template + __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) + { + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = hit.primID; + ray.geomID = hit.geomID; + instance_id_stack::copy(hit.instID, ray.instID); + } + + template + __forceinline void copyHitToRay(const vbool &mask, RayHitK &ray, const HitK &hit) + { + vfloat::storeu(mask,&ray.Ng.x, hit.Ng.x); + vfloat::storeu(mask,&ray.Ng.y, hit.Ng.y); + vfloat::storeu(mask,&ray.Ng.z, hit.Ng.z); + vfloat::storeu(mask,&ray.u, hit.u); + vfloat::storeu(mask,&ray.v, hit.v); + vuint::storeu(mask,&ray.primID, hit.primID); + vuint::storeu(mask,&ray.geomID, hit.geomID); + instance_id_stack::copy(hit.instID, ray.instID, mask); + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h new file mode 100644 index 00000000000..d7e3637f7b2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h @@ -0,0 +1,199 @@ +// Copyright 2009-2020 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "rtcore.h" + +namespace embree { +namespace instance_id_stack { + +static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, + "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0."); + +/******************************************************************************* + * Instance ID stack manipulation. + * This is used from the instance intersector. + ******************************************************************************/ + +/* + * Push an instance to the stack. + */ +RTC_FORCEINLINE bool push(RTCIntersectContext* context, + unsigned instanceId) +{ +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT; + /* We assert here because instances are silently dropped when the stack is full. + This might be quite hard to find in production. */ + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[context->instStackSize++] = instanceId; + return spaceAvailable; +#else + const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); + assert(spaceAvailable); + if (likely(spaceAvailable)) + context->instID[0] = instanceId; + return spaceAvailable; +#endif +} + + +/* + * Pop the last instance pushed to the stack. + * Do not call on an empty stack. + */ +RTC_FORCEINLINE void pop(RTCIntersectContext* context) +{ + assert(context); +#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1 + assert(context->instStackSize > 0); + context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; +#else + assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); + context->instID[0] = RTC_INVALID_GEOMETRY_ID; +#endif +} + +/******************************************************************************* + * Optimized instance id stack copy. + * The copy() function at the bottom of this block will either copy full + * stacks or copy only until the last valid element has been copied, depending + * on RTC_MAX_INSTANCE_LEVEL_COUNT. 
+ ******************************************************************************/ + +/* + * Plain array assignment. This works for scalar->scalar, + * scalar->vector, and vector->vector. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt) +{ + tgt[level] = src[level]; +} + +/* + * Masked SIMD vector->vector store. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, vuint* tgt, const vbool& mask) +{ + vuint::storeu(mask, tgt + level, src[level]); +} + +/* + * Masked scalar->SIMD vector store. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint* tgt, const vbool& mask) +{ + vuint::store(mask, tgt + level, src[level]); +} + +/* + * Indexed assign from vector to scalar. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, unsigned* tgt, const size_t& idx) +{ + tgt[level] = src[level][idx]; +} + +/* + * Indexed assign from scalar to vector. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint* tgt, const size_t& idx) +{ + tgt[level][idx] = src[level]; +} + +/* + * Indexed assign from vector to vector. + */ +template +RTC_FORCEINLINE void level_copy(unsigned level, const vuint* src, vuint* tgt, const size_t& i, const size_t& j) +{ + tgt[level][j] = src[level][i]; +} + +/* + * Check if the given stack level is valid. + * These are only used for large max stack sizes. 
+ */ +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} +template +RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool& /*mask*/) +{ + return stack[level] != RTC_INVALID_GEOMETRY_ID; +} + +template +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack) +{ + return any(stack[level] != RTC_INVALID_GEOMETRY_ID); +} +template +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const vbool& mask) +{ + return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID)); +} + +template +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const size_t& i) +{ + return stack[level][i] != RTC_INVALID_GEOMETRY_ID; +} +template +RTC_FORCEINLINE bool level_valid(unsigned level, const vuint* stack, const size_t& i, const size_t& /*j*/) +{ + return stack[level][i] != RTC_INVALID_GEOMETRY_ID; +} + +/* + * Copy an instance ID stack. + * + * This function automatically selects a LevelFunctor from the above Assign + * structs. + */ +template +RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args) +{ +#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1) + /* + * Avoid all loops for only one level. + */ + level_copy(0, src, tgt, std::forward(args)...); + +#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4) + /* + * It is faster to avoid the valid test for low level counts. + * Just copy the whole stack. + */ + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + level_copy(l, src, tgt, std::forward(args)...); + +#else + /* + * For general stack sizes, it pays off to test for validity. 
+ */ + bool valid = true; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l) + { + level_copy(l, src, tgt, std::forward(args)...); + valid = level_valid(l, src, std::forward(args)...); + } +#endif +} + +} // namespace instance_id_stack +} // namespace embree + diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h new file mode 100644 index 00000000000..63fb8d33510 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/isa.h @@ -0,0 +1,271 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/sys/platform.h" +#include "../../common/sys/sysinfo.h" + +namespace embree +{ +#define DEFINE_SYMBOL2(type,name) \ + typedef type (*name##Func)(); \ + name##Func name; + +#define DECLARE_SYMBOL2(type,name) \ + namespace sse2 { extern type name(); } \ + namespace sse42 { extern type name(); } \ + namespace avx { extern type name(); } \ + namespace avx2 { extern type name(); } \ + namespace avx512knl { extern type name(); } \ + namespace avx512skx { extern type name(); } \ + void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \ + type name##_error() { return type(name##_error2); } \ + type name##_zero() { return type(nullptr); } + +#define DECLARE_ISA_FUNCTION(type,symbol,args) \ + namespace sse2 { extern type symbol(args); } \ + namespace sse42 { extern type symbol(args); } \ + namespace avx { extern type symbol(args); } \ + namespace avx2 { extern type symbol(args); } \ + namespace avx512knl { extern type symbol(args); } \ + namespace avx512skx { extern type symbol(args); } \ + inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \ + typedef type (*symbol##Ty)(args); \ + +#define DEFINE_ISA_FUNCTION(type,symbol,args) \ + typedef type (*symbol##Func)(args); \ + symbol##Func symbol; + 
+#define ZERO_SYMBOL(features,intersector) \ + intersector = intersector##_zero; + +#define INIT_SYMBOL(features,intersector) \ + intersector = decltype(intersector)(intersector##_error); + +#define SELECT_SYMBOL_DEFAULT(features,intersector) \ + intersector = isa::intersector; + +#if defined(__SSE__) || defined(__ARM_NEON) +#if !defined(EMBREE_TARGET_SIMD4) +#define EMBREE_TARGET_SIMD4 +#endif +#endif + +#if defined(EMBREE_TARGET_SSE42) +#define SELECT_SYMBOL_SSE42(features,intersector) \ + if ((features & SSE42) == SSE42) intersector = sse42::intersector; +#else +#define SELECT_SYMBOL_SSE42(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX) || defined(__AVX__) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & ISA) == ISA) intersector = isa::intersector; +#else +#define SELECT_SYMBOL_AVX(features,intersector) \ + if ((features & AVX) == AVX) intersector = avx::intersector; +#endif +#else +#define SELECT_SYMBOL_AVX(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX2) +#if !defined(EMBREE_TARGET_SIMD8) +#define EMBREE_TARGET_SIMD8 +#endif +#define SELECT_SYMBOL_AVX2(features,intersector) \ + if ((features & AVX2) == AVX2) intersector = avx2::intersector; +#else +#define SELECT_SYMBOL_AVX2(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512KNL) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512KNL(features,intersector) \ + if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector; +#else +#define SELECT_SYMBOL_AVX512KNL(features,intersector) +#endif + +#if defined(EMBREE_TARGET_AVX512SKX) +#if !defined(EMBREE_TARGET_SIMD16) +#define EMBREE_TARGET_SIMD16 +#endif +#define SELECT_SYMBOL_AVX512SKX(features,intersector) \ + if ((features & AVX512SKX) == AVX512SKX) intersector = 
avx512skx::intersector; +#else +#define SELECT_SYMBOL_AVX512SKX(features,intersector) +#endif + +#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define 
SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define 
SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + ZERO_SYMBOL(features,intersector); \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + 
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \ + SELECT_SYMBOL_DEFAULT(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \ + INIT_SYMBOL(features,intersector); \ + SELECT_SYMBOL_AVX512KNL(features,intersector); \ + SELECT_SYMBOL_AVX512SKX(features,intersector); + +#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \ + SELECT_SYMBOL_SSE42(features,intersector); \ + SELECT_SYMBOL_AVX(features,intersector); \ + SELECT_SYMBOL_AVX2(features,intersector); + + struct VerifyMultiTargetLinking { + static __noinline int getISA(int depth = 5) { + if (depth == 0) return ISA; + else return getISA(depth-1); + } + }; + namespace sse2 { int getISA(); }; + namespace sse42 { int getISA(); }; + namespace avx { int getISA(); }; + namespace avx2 { int getISA(); }; + namespace avx512knl { int getISA(); }; + namespace avx512skx { int getISA(); }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h new file mode 100644 index 00000000000..82953f0e89a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h @@ -0,0 +1,325 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../common/math/affinespace.h" +#include "../../common/math/interval.h" + +#include + +namespace embree { + +#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f + +static void motion_derivative_coefficients(const float *p, float *coeff); + +struct MotionDerivativeCoefficients +{ + float theta; + float coeffs[3*8*7]; + + MotionDerivativeCoefficients() {} + + // xfm0 and xfm1 are interpret as quaternion decomposition + MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, 
AffineSpace3ff const& xfm1) + { + // cosTheta of the two quaternions + const float cosTheta = min(1.f, max(-1.f, + xfm0.l.vx.w * xfm1.l.vx.w + + xfm0.l.vy.w * xfm1.l.vy.w + + xfm0.l.vz.w * xfm1.l.vz.w + + xfm0.p.w * xfm1.p.w)); + + theta = std::acos(cosTheta); + Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); + if (cosTheta < 0.995f) { + // compute perpendicular quaternion + qperp.x = xfm1.p.w - cosTheta * xfm0.p.w; + qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w; + qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w; + qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w; + qperp = normalize(qperp); + } + const float p[33] = { + theta, + xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0 + xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1 + xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0 + qperp.x, qperp.y, qperp.z, qperp.w, + xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0 + xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y, + xfm0.l.vz.z, xfm0.p.z, + xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1 + xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y, + xfm1.l.vz.z, xfm1.p.z + }; + motion_derivative_coefficients(p, coeffs); + } +}; + +struct MotionDerivative +{ + float twoTheta; + float c[8]; + + MotionDerivative(MotionDerivativeCoefficients const& mdc, + int dim, Vec3fa const& p0, Vec3fa const& p1) + : twoTheta(2.f*mdc.theta) + { + const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z }; + for (int i = 0; i < 8; ++i) { + c[i] = 0; + for (int j = 0; j < 7; ++j) { + c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j]; + } + } + } + + template + struct EvalMotionDerivative + { + MotionDerivative const& md; + float offset; + + EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {} + + T operator()(T const& time) const { + return md.c[0] + md.c[1] * time + + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * 
time) + + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time) + + offset; + } + }; + + unsigned int findRoots( + Interval1f const& interval, + float offset, + float* roots, + unsigned int maxNumRoots) + { + unsigned int numRoots = 0; + EvalMotionDerivative eval(*this, offset); + findRoots(eval, interval, numRoots, roots, maxNumRoots); + return numRoots; + } + + template + static void findRoots( + + Eval const& eval, + Interval1f const& interval, + unsigned int& numRoots, + float* roots, + unsigned int maxNumRoots) + { + Interval1f range = eval(interval); + if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return; + + const float split = 0.5f * (interval.upper + interval.lower); + if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f || abs(split-interval.upper) < 1e-7f) + { + // check if the root already exists + for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) { + if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) + return; + } + if (numRoots < maxNumRoots) { + roots[numRoots++] = split; + } + if (numRoots > maxNumRoots) { + printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS + return; + } + return; + } + + findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots); + findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots); + } +}; + +/****************************************************************************** + * Code generated with sympy 1.4 * + * See http://www.sympy.org/ for more information. 
* + * * + * see * + * * + * scripts/generate_motion_derivative_coefficients.py * + * * + * for how this code is generated * + * * + ******************************************************************************/ +static void motion_derivative_coefficients(const float *p, float *coeff) +{ + coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27]; + coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24]; + coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25]; + coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26]; + coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15]; + coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - 
p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16]; + coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17]; + coeff[7] = 0; + coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24]; + coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25]; + coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26]; + coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24]; + coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 
2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25]; + coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26]; + coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27]; + coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24]; + coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 
2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25]; + coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26]; + coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15]; + coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16]; + coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17]; + coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - 
p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0]; + coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24]; + coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25]; + coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 
2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26]; + coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24]; + coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25]; + coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 
2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26]; + coeff[28] = 0; + coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0]; + coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0]; + coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0]; + coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0]; + coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0]; + coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - 
p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0]; + coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27]; + coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24]; + coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25]; + coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 
2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26]; + coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15]; + coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16]; + coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17]; + coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0]; + coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24]; + coeff[44] = -4*p[0]*p[7]*p[10]*p[19] 
+ 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25]; + coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26]; + coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 
2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24]; + coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25]; + coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26]; + coeff[49] = 0; + coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0]; + coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] 
+ p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0]; + coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0]; + coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0]; + coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0]; + coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0]; + coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - 
p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30]; + coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24]; + coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28]; + coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29]; + coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15]; + coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19]; + coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20]; + coeff[63] = 0; + coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 
2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28]; + coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29]; + coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28]; + coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29]; + coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 
2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30]; + coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24]; + coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28]; + coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 
2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29]; + coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15]; + coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19]; + coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20]; + coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0]; + coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24]; + coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 
2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28]; + coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29]; + coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24]; + coeff[82] = 
2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28]; + coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29]; + coeff[84] = 0; + coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0]; + coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0]; + coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + 
p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0]; + coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0]; + coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0]; + coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0]; + coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - 
p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30]; + coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24]; + coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28]; + coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29]; + coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15]; + coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + 
p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19]; + coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20]; + coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0]; + coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24]; + coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28]; + 
coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29]; + coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24]; + coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 
2*p[10]*p[14]*p[28]; + coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29]; + coeff[105] = 0; + coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0]; + coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0]; + coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0]; + coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + 
p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0]; + coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0]; + coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0]; + coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32]; + coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24]; + coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28]; + coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 
2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31]; + coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15]; + coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19]; + coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22]; + coeff[119] = 0; + coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] 
+ 2*p[22] - 2*p[31]; + coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31]; + coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30]; + coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 
2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24]; + coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28]; + coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29]; + coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15]; + coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19]; + coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + 
p[13]*p[13]*p[22] - p[13]*p[14]*p[20]; + coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0]; + coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24]; + coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28]; + coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 
2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29]; + coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24]; + coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28]; + coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 
2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29]; + coeff[140] = 0; + coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0]; + coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0]; + coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0]; + coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0]; + coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 
p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0]; + coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0]; + coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32]; + coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24]; + coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 
2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28]; + coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31]; + coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15]; + coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19]; + coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22]; + coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0]; + 
coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24]; + coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28]; + coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 
2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31]; + coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24]; + coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28]; + coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 
2*p[10]*p[14]*p[31]; + coeff[161] = 0; + coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0]; + coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0]; + coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0]; + coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0]; + coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0]; + coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + 
p[13]*p[14]*p[29])*p[0]; +} + +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h new file mode 100644 index 00000000000..27d158ca3ab --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/point_query.h @@ -0,0 +1,136 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* Point query structure for closest point query */ + template + struct RTC_ALIGN(16) PointQueryK + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3vf& p, const vfloat& radius = inf, const vfloat& time = zero) + : p(p), time(time), radius(radius) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool valid() const + { + const vbool vx = (abs(p.x) <= vfloat(FLT_LARGE)); + const vbool vy = (abs(p.y) <= vfloat(FLT_LARGE)); + const vbool vz = (abs(p.z) <= vfloat(FLT_LARGE)); + const vbool vn = radius >= vfloat(0); + const vbool vf = abs(time) < vfloat(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(PointQueryK<1>* ray) const; + __forceinline void get(size_t i, PointQueryK<1>& ray) const; + __forceinline void set(const PointQueryK<1>* ray); + __forceinline void set(size_t i, const PointQueryK<1>& ray); + + Vec3vf p; // location of the query point + vfloat time; // time for motion blur + vfloat radius; // radius for the point query + }; + + /* Specialization for a single point query */ + template<> + struct RTC_ALIGN(16) PointQueryK<1> + { + /* Default construction does nothing */ + __forceinline PointQueryK() {} + + /* Constructs a ray from origin, direction, and ray segment. 
Near + * has to be smaller than far */ + __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero) + : p(p), time(time), radius(radius) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && abs(time) < float(inf); + } + + Vec3f p; + float time; + float radius; + }; + + /* Converts point query packet to single point query */ + template + __forceinline void PointQueryK::get(PointQueryK<1>* query) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + query[i].p.x = p.x[i]; + query[i].p.y = p.y[i]; + query[i].p.z = p.z[i]; + query[i].time = time[i]; + query[i].radius = radius[i]; + } + } + + /* Extracts a single point query out of a point query packet*/ + template + __forceinline void PointQueryK::get(size_t i, PointQueryK<1>& query) const + { + query.p.x = p.x[i]; + query.p.y = p.y[i]; + query.p.z = p.z[i]; + query.radius = radius[i]; + query.time = time[i]; + } + + /* Converts single point query to point query packet */ + template + __forceinline void PointQueryK::set(const PointQueryK<1>* query) + { + for (size_t i = 0; i < K; i++) + { + p.x[i] = query[i].p.x; + p.y[i] = query[i].p.y; + p.z[i] = query[i].p.z; + radius[i] = query[i].radius; + time[i] = query[i].time; + } + } + + /* inserts a single point query into a point query packet element */ + template + __forceinline void PointQueryK::set(size_t i, const PointQueryK<1>& query) + { + p.x[i] = query.p.x; + p.y[i] = query.p.y; + p.z[i] = query.p.z; + radius[i] = query.radius; + time[i] = query.time; + } + + /* Shortcuts */ + typedef PointQueryK<1> PointQuery; + typedef PointQueryK<4> PointQuery4; + typedef PointQueryK<8> PointQuery8; + typedef PointQueryK<16> PointQuery16; + struct PointQueryN; + + /* Outputs point query to stream */ + template + __forceinline embree_ostream operator 
<<(embree_ostream cout, const PointQueryK& query) + { + cout << "{ " << embree_endl + << " p = " << query.p << embree_endl + << " r = " << query.radius << embree_endl + << " time = " << query.time << embree_endl + << "}"; + return cout; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h new file mode 100644 index 00000000000..ce75c982bbe --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/primref.h @@ -0,0 +1,138 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(32) PrimRef + { + __forceinline PrimRef () {} + +#if defined(__AVX__) + __forceinline PrimRef(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); + } + __forceinline PrimRef& operator=(const PrimRef& v) { + vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this; + } +#endif + + __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) + { + lower = Vec3fx(bounds.lower, geomID); + upper = Vec3fx(bounds.upper, primID); + } + + __forceinline PrimRef (const BBox3fa& bounds, size_t id) + { +#if defined(__X86_64__) || defined(__aarch64__) + lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF)); + upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF)); +#else + lower = Vec3fx(bounds.lower, (unsigned)id); + upper = Vec3fx(bounds.upper, (unsigned)0); +#endif + } + + /*! calculates twice the center of the primitive */ + __forceinline const Vec3fa center2() const { + return lower+upper; + } + + /*! return the bounding box of the primitive */ + __forceinline const BBox3fa bounds() const { + return BBox3fa(lower,upper); + } + + /*! size for bin heuristic is 1 */ + __forceinline unsigned size() const { + return 1; + } + + /*! 
returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = embree::center2(bounds_o); + } + + __forceinline unsigned& geomIDref() { // FIXME: remove !!!!!!! + return lower.u; + } + __forceinline unsigned& primIDref() { // FIXME: remove !!!!!!! + return upper.u; + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(lower.u) + (size_t(upper.u) << 32); +#else + return size_t(lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }"; + } + + public: + Vec3fx lower; //!< lower bounds and geomID + Vec3fx upper; //!< upper bounds and primID + }; + + /*! 
fast exchange for PrimRefs */ + __forceinline void xchg(PrimRef& a, PrimRef& b) + { +#if defined(__AVX__) + const vfloat8 aa = vfloat8::load((float*)&a); + const vfloat8 bb = vfloat8::load((float*)&b); + vfloat8::store((float*)&a,bb); + vfloat8::store((float*)&b,aa); +#else + std::swap(a,b); +#endif + } + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct SubGridBuildData { + unsigned short sx,sy; + unsigned int primID; + + __forceinline SubGridBuildData() {}; + __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx(sx), sy(sy), primID(primID) {}; + + __forceinline size_t x() const { return (size_t)sx & 0x7fff; } + __forceinline size_t y() const { return (size_t)sy & 0x7fff; } + + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h new file mode 100644 index 00000000000..b6c1ad57121 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h @@ -0,0 +1,262 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +#define MBLUR_BIN_LBBOX 1 + +namespace embree +{ +#if MBLUR_BIN_LBBOX + + /*! A primitive reference stores the bounds of the primitive and its ID. 
*/ + struct PrimRefMB + { + typedef LBBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); + lbounds.bounds0.lower.a = geomID; + lbounds.bounds0.upper.a = primID; + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + lbounds.bounds0.lower.a = id & 0xFFFFFFFF; + lbounds.bounds0.upper.a = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.a = id; + lbounds.bounds0.upper.a = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : lbounds((LBBox3fx)lbounds_i), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + lbounds.bounds0.lower.u = id & 0xFFFFFFFF; + lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + lbounds.bounds0.lower.u = id; + lbounds.bounds0.upper.u = 0; +#endif + lbounds.bounds1.lower.a = activeTimeSegments; + lbounds.bounds1.upper.a = totalTimeSegments; + } + + /*! returns bounds for binning */ + __forceinline LBBox3fa bounds() const { + return lbounds; + } + + /*! 
returns the number of time segments of this primref */ + __forceinline unsigned size() const { + return lbounds.bounds1.lower.a; + } + + __forceinline unsigned totalTimeSegments() const { + return lbounds.bounds1.upper.a; + } + + /* calculate overlapping time segment range */ + __forceinline range timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(totalTimeSegments())); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)totalTimeSegments()); + return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments()); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = binCenter(); + } + + /*! returns the geometry ID */ + __forceinline unsigned geomID() const { + return lbounds.bounds0.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned primID() const { + return lbounds.bounds0.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32); +#else + return size_t(lbounds.bounds0.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! 
allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + LBBox3fx lbounds; + BBox1f time_range; // entire geometry time range + }; + +#else + + /*! A primitive reference stores the bounds of the primitive and its ID. */ + struct __aligned(16) PrimRefMB + { + typedef BBox3fa BBox; + + __forceinline PrimRefMB () {} + + __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); + bbox.lower.a = geomID; + bbox.upper.a = primID; + } + + __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id) + : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range) + { + assert(activeTimeSegments > 0); +#if defined(__X86_64__) || defined(__aarch64__) + bbox.lower.u = id & 0xFFFFFFFF; + bbox.upper.u = (id >> 32) & 0xFFFFFFFF; +#else + bbox.lower.u = id; + bbox.upper.u = 0; +#endif + } + + /*! returns bounds for binning */ + __forceinline BBox3fa bounds() const { + return bbox; + } + + /*! 
returns the number of time segments of this primref */ + __forceinline unsigned int size() const { + return _activeTimeSegments; + } + + __forceinline unsigned int totalTimeSegments() const { + return _totalTimeSegments; + } + + /* calculate overlapping time segment range */ + __forceinline range timeSegmentRange(const BBox1f& range) const { + return getTimeSegmentRange(range,time_range,float(_totalTimeSegments)); + } + + /* returns time that corresponds to time step */ + __forceinline float timeStep(const int i) const { + assert(i>=0 && i<=(int)_totalTimeSegments); + return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments); + } + + /*! checks if time range overlaps */ + __forceinline bool time_range_overlap(const BBox1f& range) const + { + if (0.9999f*time_range.upper <= range.lower) return false; + if (1.0001f*time_range.lower >= range.upper) return false; + return true; + } + + /*! returns center for binning */ + __forceinline Vec3fa binCenter() const { + return center2(bounds()); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const + { + bounds_o = bounds(); + center_o = center2(bounds()); + } + + /*! returns the geometry ID */ + __forceinline unsigned int geomID() const { + return bbox.lower.a; + } + + /*! returns the primitive ID */ + __forceinline unsigned int primID() const { + return bbox.upper.a; + } + + /*! returns an size_t sized ID */ + __forceinline size_t ID() const { +#if defined(__X86_64__) || defined(__aarch64__) + return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32); +#else + return size_t(bbox.lower.u); +#endif + } + + /*! special function for operator< */ + __forceinline uint64_t ID64() const { + return (((uint64_t)primID()) << 32) + (uint64_t)geomID(); + } + + /*! allows sorting the primrefs by ID */ + friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { + return p0.ID64() < p1.ID64(); + } + + /*! 
Outputs primitive reference to a stream. */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { + return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ", total_segments = " << ref.totalTimeSegments() << " }"; + } + + public: + BBox3fa bbox; // bounds, geomID, primID + unsigned int _activeTimeSegments; + unsigned int _totalTimeSegments; + BBox1f time_range; // entire geometry time range + }; + +#endif +} diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h new file mode 100644 index 00000000000..a7de36414de --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/profile.h @@ -0,0 +1,159 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! helper structure for the implementation of the profile functions below */ + struct ProfileTimer + { + static const size_t N = 20; + + ProfileTimer () {} + + ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0) + { + for (size_t i=0; i=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + __forceinline void relative (const char* name) + { + const double t1 = getSeconds(); + const double dt = t1-tj; + tj = t1; + assert(names[j] == nullptr || names[j] == name); + names[j] = name; + if (i == 0) dt_fst[j] = dt; + if (i>=numSkip) { + dt_min[j] = min(dt_min[j],dt); + dt_avg[j] = dt_avg[j] + dt; + dt_max[j] = max(dt_max[j],dt); + } + j++; + maxJ = max(maxJ,j); + } + + void print(size_t numElements) + { + for (size_t k=0; k + void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + ProfileTimer timer(numSkip); + + for (size_t i=0; i + void 
profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure) + { + timer = ProfileTimer(numSkip); + + for (size_t i=0; i + struct RayK + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayK(const Vec3vf& org, const Vec3vf& dir, + const vfloat& tnear = zero, const vfloat& tfar = inf, + const vfloat& time = zero, const vint& mask = -1, const vint& id = 0, const vint& flags = 0) + : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {} + + /* Returns the size of the ray */ + static __forceinline size_t size() { return K; } + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline vbool valid() const + { + const vbool vx = (abs(org.x) <= vfloat(FLT_LARGE)) & (abs(dir.x) <= vfloat(FLT_LARGE)); + const vbool vy = (abs(org.y) <= vfloat(FLT_LARGE)) & (abs(dir.y) <= vfloat(FLT_LARGE)); + const vbool vz = (abs(org.z) <= vfloat(FLT_LARGE)) & (abs(dir.z) <= vfloat(FLT_LARGE)); + const vbool vn = abs(tnear()) <= vfloat(inf); + const vbool vf = abs(tfar) <= vfloat(inf); + return vx & vy & vz & vn & vf; + } + + __forceinline void get(RayK<1>* ray) const; + __forceinline void get(size_t i, RayK<1>& ray) const; + __forceinline void set(const RayK<1>* ray); + __forceinline void set(size_t i, const RayK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + __forceinline vint octant() const + { + return select(dir.x < 0.0f, vint(1), vint(zero)) | + select(dir.y < 0.0f, vint(2), vint(zero)) | + select(dir.z < 0.0f, vint(4), vint(zero)); + } + + /* Ray data */ + Vec3vf org; // ray origin + vfloat _tnear; // start of ray segment + Vec3vf dir; // ray direction + vfloat _time; // time of this ray for motion blur + vfloat tfar; // end of ray segment + vint mask; // used to mask out 
objects during traversal + vint id; + vint flags; + + __forceinline vfloat& tnear() { return _tnear; } + __forceinline vfloat& time() { return _time; } + __forceinline const vfloat& tnear() const { return _tnear; } + __forceinline const vfloat& time() const { return _time; } + }; + + /* Ray+hit structure for K rays */ + template + struct RayHitK : RayK + { + using RayK::org; + using RayK::_tnear; + using RayK::dir; + using RayK::_time; + using RayK::tfar; + using RayK::mask; + using RayK::id; + using RayK::flags; + + using RayK::tnear; + using RayK::time; + + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3vf& org, const Vec3vf& dir, + const vfloat& tnear = zero, const vfloat& tfar = inf, + const vfloat& time = zero, const vint& mask = -1, const vint& id = 0, const vint& flags = 0) + : RayK(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK(const RayK& ray) + : RayK(ray), + geomID(RTC_INVALID_GEOMETRY_ID) + { + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + } + + __forceinline RayHitK& operator =(const RayK& ray) + { + org = ray.org; + _tnear = ray._tnear; + dir = ray.dir; + _time = ray._time; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit(const vbool& valid0) const + { + vbool valid = valid0 & geomID != vuint(RTC_INVALID_GEOMETRY_ID); + const vbool vt = (abs(tfar) <= vfloat(FLT_LARGE)) | (tfar == vfloat(neg_inf)); + const vbool vu = 
(abs(u) <= vfloat(FLT_LARGE)); + const vbool vv = (abs(u) <= vfloat(FLT_LARGE)); + const vbool vnx = abs(Ng.x) <= vfloat(FLT_LARGE); + const vbool vny = abs(Ng.y) <= vfloat(FLT_LARGE); + const vbool vnz = abs(Ng.z) <= vfloat(FLT_LARGE); + if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t"); + if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u"); + if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v"); + if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x"); + if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y"); + if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z"); + } + + __forceinline void get(RayHitK<1>* ray) const; + __forceinline void get(size_t i, RayHitK<1>& ray) const; + __forceinline void set(const RayHitK<1>* ray); + __forceinline void set(size_t i, const RayHitK<1>& ray); + + __forceinline void copy(size_t dest, size_t source); + + /* Hit data */ + Vec3vf Ng; // geometry normal + vfloat u; // barycentric u coordinate of hit + vfloat v; // barycentric v coordinate of hit + vuint primID; // primitive ID + vuint geomID; // geometry ID + vuint instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Specialization for a single ray */ + template<> + struct RayK<1> + { + /* Default construction does nothing */ + __forceinline RayK() {} + + /* Constructs a ray from origin, direction, and ray segment. 
Near + * has to be smaller than far */ + __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {} + + /* Calculates if this is a valid ray that does not cause issues during traversal */ + __forceinline bool valid() const { + return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf); + } + + /* Ray data */ + Vec3ff org; // 3 floats for ray origin, 1 float for tnear + //float tnear; // start of ray segment + Vec3ff dir; // 3 floats for ray direction, 1 float for time + // float time; + float tfar; // end of ray segment + int mask; // used to mask out objects during traversal + int id; // ray ID + int flags; // ray flags + + __forceinline float& tnear() { return org.w; }; + __forceinline const float& tnear() const { return org.w; }; + + __forceinline float& time() { return dir.w; }; + __forceinline const float& time() const { return dir.w; }; + + }; + + template<> + struct RayHitK<1> : RayK<1> + { + /* Default construction does nothing */ + __forceinline RayHitK() {} + + /* Constructs a ray from origin, direction, and ray segment. 
Near + * has to be smaller than far */ + __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0) + : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK(const RayK<1>& ray) + : RayK<1>(ray), + geomID(RTC_INVALID_GEOMETRY_ID) {} + + __forceinline RayHitK<1>& operator =(const RayK<1>& ray) + { + org = ray.org; + dir = ray.dir; + tfar = ray.tfar; + mask = ray.mask; + id = ray.id; + flags = ray.flags; + + geomID = RTC_INVALID_GEOMETRY_ID; + + return *this; + } + + /* Calculates if the hit is valid */ + __forceinline void verifyHit() const + { + if (geomID == RTC_INVALID_GEOMETRY_ID) return; + const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf)); + const bool vu = (abs(u) <= FLT_LARGE); + const bool vv = (abs(u) <= FLT_LARGE); + const bool vnx = abs(Ng.x) <= FLT_LARGE; + const bool vny = abs(Ng.y) <= FLT_LARGE; + const bool vnz = abs(Ng.z) <= FLT_LARGE; + if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t"); + if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u"); + if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v"); + if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x"); + if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y"); + if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z"); + } + + /* Hit data */ + Vec3f Ng; // not normalized geometry normal + float u; // barycentric u coordinate of hit + float v; // barycentric v coordinate of hit + unsigned int primID; // primitive ID + unsigned int geomID; // geometry ID + unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID + }; + + /* Converts ray packet to single rays */ + template + __forceinline void RayK::get(RayK<1>* ray) const + { + for (size_t i = 0; i < K; i++) // FIXME: use SIMD transpose + { + ray[i].org.x = org.x[i]; ray[i].org.y = org.y[i]; ray[i].org.z = org.z[i]; ray[i].tnear() = 
tnear()[i]; + ray[i].dir.x = dir.x[i]; ray[i].dir.y = dir.y[i]; ray[i].dir.z = dir.z[i]; ray[i].time() = time()[i]; + ray[i].tfar = tfar[i]; ray[i].mask = mask[i]; ray[i].id = id[i]; ray[i].flags = flags[i]; + } + } + + template + __forceinline void RayHitK::get(RayHitK<1>* ray) const + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + get(i, ray[i]); + } + + /* Extracts a single ray out of a ray packet*/ + template + __forceinline void RayK::get(size_t i, RayK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.time() = time()[i]; + ray.tfar = tfar[i]; ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + } + + template + __forceinline void RayHitK::get(size_t i, RayHitK<1>& ray) const + { + ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i]; ray.tnear() = tnear()[i]; + ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i]; ray.tfar = tfar[i]; ray.time() = time()[i]; + ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i]; + ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i]; + ray.u = u[i]; ray.v = v[i]; + ray.primID = primID[i]; ray.geomID = geomID[i]; + + instance_id_stack::copy(instID, ray.instID, i); + } + + /* Converts single rays to ray packet */ + template + __forceinline void RayK::set(const RayK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + template + __forceinline void RayHitK::set(const RayHitK<1>* ray) + { + // FIXME: use SIMD transpose + for (size_t i = 0; i < K; i++) + set(i, ray[i]); + } + + /* inserts a single ray into a ray packet element */ + template + __forceinline void RayK::set(size_t i, const RayK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + 
tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + } + + template + __forceinline void RayHitK::set(size_t i, const RayHitK<1>& ray) + { + org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z; tnear()[i] = ray.tnear(); + dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z; time()[i] = ray.time(); + tfar[i] = ray.tfar; mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags; + Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z; + u[i] = ray.u; v[i] = ray.v; + primID[i] = ray.primID; geomID[i] = ray.geomID; + + instance_id_stack::copy(ray.instID, instID, i); + } + + /* copies a ray packet element into another element*/ + template + __forceinline void RayK::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + } + + template + __forceinline void RayHitK::copy(size_t dest, size_t source) + { + org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source]; tnear()[dest] = tnear()[source]; + dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source]; time()[dest] = time()[source]; + tfar [dest] = tfar[source]; mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source]; + Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source]; + u[dest] = u[source]; v[dest] = v[source]; + primID[dest] = primID[source]; geomID[dest] = geomID[source]; + + instance_id_stack::copy(instID, instID, source, dest); + } + + /* Shortcuts */ + typedef RayK<1> Ray; + typedef RayK<4> Ray4; + typedef RayK<8> Ray8; + typedef RayK<16> Ray16; + struct RayN; + + typedef RayHitK<1> RayHit; + typedef RayHitK<4> RayHit4; + 
typedef RayHitK<8> RayHit8; + typedef RayHitK<16> RayHit16; + struct RayHitN; + + template + struct RayTypeHelper; + + template + struct RayTypeHelper + { + typedef RayHitK Ty; + }; + + template + struct RayTypeHelper + { + typedef RayK Ty; + }; + + template + using RayType = typename RayTypeHelper<1, intersect>::Ty; + + template + using RayTypeK = typename RayTypeHelper::Ty; + + /* Outputs ray to stream */ + template + __forceinline embree_ostream operator <<(embree_ostream cout, const RayK& ray) + { + return cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << "}"; + } + + template + __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK& ray) + { + cout << "{ " << embree_endl + << " org = " << ray.org << embree_endl + << " dir = " << ray.dir << embree_endl + << " near = " << ray.tnear() << embree_endl + << " far = " << ray.tfar << embree_endl + << " time = " << ray.time() << embree_endl + << " mask = " << ray.mask << embree_endl + << " id = " << ray.id << embree_endl + << " flags = " << ray.flags << embree_endl + << " Ng = " << ray.Ng + << " u = " << ray.u << embree_endl + << " v = " << ray.v << embree_endl + << " primID = " << ray.primID << embree_endl + << " geomID = " << ray.geomID << embree_endl + << " instID ="; + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + { + cout << " " << ray.instID[l]; + } + cout << embree_endl; + return cout << "}"; + } + + struct RayStreamSOA + { + __forceinline RayStreamSOA(void* rays, size_t N) + : ptr((char*)rays), N(N) {} + + /* ray data access functions */ + __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; } // x coordinate of ray origin + 
__forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; } // y coordinate of ray origin + __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }; // z coordinate of ray origin + __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }; // start of ray segment + + __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }; // x coordinate of ray direction + __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }; // y coordinate of ray direction + __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }; // z coordinate of ray direction + __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }; // time of this ray for motion blur + + __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }; // end of ray segment (set to hit distance) + __forceinline int* mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset]; }; // used to mask out objects during traversal (optional) + __forceinline int* id (size_t offset = 0) { return (int*)&ptr[10*4*N+offset]; }; // id + __forceinline int* flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset]; }; // flags + + /* hit data access functions */ + __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }; // x coordinate of geometry normal + __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }; // y coordinate of geometry normal + __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }; // z coordinate of geometry normal + + __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }; // barycentric u coordinate of hit + __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }; // barycentric v coordinate of hit + + __forceinline unsigned int* primID(size_t offset = 0) { 
return (unsigned int*)&ptr[17*4*N+offset]; }; // primitive ID + __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }; // geometry ID + __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }; // instance ID + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = org_x(offset)[0]; + ray.org.y = org_y(offset)[0]; + ray.org.z = org_z(offset)[0]; + ray.tnear() = tnear(offset)[0]; + ray.dir.x = dir_x(offset)[0]; + ray.dir.y = dir_y(offset)[0]; + ray.dir.z = dir_z(offset)[0]; + ray.time() = time(offset)[0]; + ray.tfar = tfar(offset)[0]; + ray.mask = mask(offset)[0]; + ray.id = id(offset)[0]; + ray.flags = flags(offset)[0]; + return ray; + } + + template + __forceinline RayK getRayByOffset(size_t offset) + { + RayK ray; + ray.org.x = vfloat::loadu(org_x(offset)); + ray.org.y = vfloat::loadu(org_y(offset)); + ray.org.z = vfloat::loadu(org_z(offset)); + ray.tnear = vfloat::loadu(tnear(offset)); + ray.dir.x = vfloat::loadu(dir_x(offset)); + ray.dir.y = vfloat::loadu(dir_y(offset)); + ray.dir.z = vfloat::loadu(dir_z(offset)); + ray.time = vfloat::loadu(time(offset)); + ray.tfar = vfloat::loadu(tfar(offset)); + ray.mask = vint::loadu(mask(offset)); + ray.id = vint::loadu(id(offset)); + ray.flags = vint::loadu(flags(offset)); + return ray; + } + + template + __forceinline RayK getRayByOffset(const vbool& valid, size_t offset) + { + RayK ray; + ray.org.x = vfloat::loadu(valid, org_x(offset)); + ray.org.y = vfloat::loadu(valid, org_y(offset)); + ray.org.z = vfloat::loadu(valid, org_z(offset)); + ray.tnear() = vfloat::loadu(valid, tnear(offset)); + ray.dir.x = vfloat::loadu(valid, dir_x(offset)); + ray.dir.y = vfloat::loadu(valid, dir_y(offset)); + ray.dir.z = vfloat::loadu(valid, dir_z(offset)); + ray.time() = vfloat::loadu(valid, time(offset)); + ray.tfar = vfloat::loadu(valid, tfar(offset)); + +#if !defined(__AVX__) + /* SSE: some ray 
members must be loaded with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked loads always access the entire vector */ + if (unlikely(!all(valid))) + { + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + ray.mask[k] = mask(offset)[k]; + ray.id[k] = id(offset)[k]; + ray.flags[k] = flags(offset)[k]; + } + } + } + else +#endif + { + ray.mask = vint::loadu(valid, mask(offset)); + ray.id = vint::loadu(valid, id(offset)); + ray.flags = vint::loadu(valid, flags(offset)); + } + + return ray; + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayHitK& ray) + { + /* + * valid_i: stores which of the input rays exist (do not access nonexistent rays!) + * valid: stores which of the rays actually hit something. + */ + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat::storeu(valid, tfar(offset), ray.tfar); + vfloat::storeu(valid, Ng_x(offset), ray.Ng.x); + vfloat::storeu(valid, Ng_y(offset), ray.Ng.y); + vfloat::storeu(valid, Ng_z(offset), ray.Ng.z); + vfloat::storeu(valid, u(offset), ray.u); + vfloat::storeu(valid, v(offset), ray.v); + +#if !defined(__AVX__) + /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults, + because the SSE masked stores always access the entire vector */ + if (unlikely(!all(valid_i))) + { + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + primID(offset)[k] = ray.primID[k]; + geomID(offset)[k] = ray.geomID[k]; + + instID(0, offset)[k] = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + instID(l, offset)[k] = ray.instID[l][k]; +#endif + } + } + } + else +#endif + { + vuint::storeu(valid, primID(offset), ray.primID); + vuint::storeu(valid, 
geomID(offset), ray.geomID); + + vuint::storeu(valid, instID(0, offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint::storeu(valid, instID(l, offset), ray.instID[l]); +#endif + } + } + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat::storeu(valid, tfar(offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = dir_x(offset)[0]; + const float dy = dir_y(offset)[0]; + const float dz = dir_z(offset)[0]; + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear(offset)[0]; + const float ffar = tfar(offset)[0]; + return nnear <= ffar; + } + + template + __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) + { + RayK ray; + +#if defined(__AVX2__) + ray.org.x = vfloat::template gather<1>(valid, org_x(), offset); + ray.org.y = vfloat::template gather<1>(valid, org_y(), offset); + ray.org.z = vfloat::template gather<1>(valid, org_z(), offset); + ray.tnear() = vfloat::template gather<1>(valid, tnear(), offset); + ray.dir.x = vfloat::template gather<1>(valid, dir_x(), offset); + ray.dir.y = vfloat::template gather<1>(valid, dir_y(), offset); + ray.dir.z = vfloat::template gather<1>(valid, dir_z(), offset); + ray.time() = vfloat::template gather<1>(valid, time(), offset); + ray.tfar = vfloat::template gather<1>(valid, tfar(), offset); + ray.mask = vint::template gather<1>(valid, mask(), offset); + ray.id = vint::template gather<1>(valid, id(), offset); + ray.flags = vint::template gather<1>(valid, flags(), offset); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + 
ray.time() = zero; + ray.tfar = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *org_x(ofs); + ray.org.y[k] = *org_y(ofs); + ray.org.z[k] = *org_z(ofs); + ray.tnear()[k] = *tnear(ofs); + ray.dir.x[k] = *dir_x(ofs); + ray.dir.y[k] = *dir_y(ofs); + ray.dir.z[k] = *dir_z(ofs); + ray.time()[k] = *time(ofs); + ray.tfar[k] = *tfar(ofs); + ray.mask[k] = *mask(ofs); + ray.id[k] = *id(ofs); + ray.flags[k] = *flags(ofs); + } + } +#endif + + return ray; + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) + { + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, tfar(), offset, ray.tfar); + vfloat::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x); + vfloat::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y); + vfloat::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z); + vfloat::template scatter<1>(valid, u(), offset, ray.u); + vfloat::template scatter<1>(valid, v(), offset, ray.v); + vuint::template scatter<1>(valid, primID(), offset, ray.primID); + vuint::template scatter<1>(valid, geomID(), offset, ray.geomID); + + vuint::template scatter<1>(valid, instID(0), offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint::template scatter<1>(valid, instID(l), offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + + *Ng_x(ofs) = ray.Ng.x[k]; + *Ng_y(ofs) = ray.Ng.y[k]; + *Ng_z(ofs) = ray.Ng.z[k]; + *u(ofs) = ray.u[k]; + *v(ofs) = ray.v[k]; + *primID(ofs) = ray.primID[k]; + 
*geomID(ofs) = ray.geomID[k]; + + *instID(0, ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *instID(l, ofs) = ray.instID[l][k]; +#endif + } +#endif + } + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, tfar(), offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *tfar(ofs) = ray.tfar[k]; + } +#endif + } + } + + char* __restrict__ ptr; + size_t N; + }; + + template + struct StackRayStreamSOA : public RayStreamSOA + { + __forceinline StackRayStreamSOA(size_t K) + : RayStreamSOA(data, K) { assert(K <= MAX_K); } + + char data[MAX_K / 4 * sizeof(RayHit4)]; + }; + + + struct RayStreamSOP + { + template + __forceinline void init(T& t) + { + org_x = (float*)&t.org.x; + org_y = (float*)&t.org.y; + org_z = (float*)&t.org.z; + tnear = (float*)&t.tnear; + dir_x = (float*)&t.dir.x; + dir_y = (float*)&t.dir.y; + dir_z = (float*)&t.dir.z; + time = (float*)&t.time; + tfar = (float*)&t.tfar; + mask = (unsigned int*)&t.mask; + id = (unsigned int*)&t.id; + flags = (unsigned int*)&t.flags; + + Ng_x = (float*)&t.Ng.x; + Ng_y = (float*)&t.Ng.y; + Ng_z = (float*)&t.Ng.z; + u = (float*)&t.u; + v = (float*)&t.v; + primID = (unsigned int*)&t.primID; + geomID = (unsigned int*)&t.geomID; + + for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) + instID[l] = (unsigned int*)&t.instID[l]; + } + + __forceinline Ray getRayByOffset(size_t offset) + { + Ray ray; + ray.org.x = *(float* __restrict__)((char*)org_x + offset); + ray.org.y = *(float* __restrict__)((char*)org_y + offset); + ray.org.z = *(float* __restrict__)((char*)org_z + 
offset); + ray.dir.x = *(float* __restrict__)((char*)dir_x + offset); + ray.dir.y = *(float* __restrict__)((char*)dir_y + offset); + ray.dir.z = *(float* __restrict__)((char*)dir_z + offset); + ray.tfar = *(float* __restrict__)((char*)tfar + offset); + ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + ray.time() = time ? *(float* __restrict__)((char*)time + offset) : 0.0f; + ray.mask = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1; + ray.id = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1; + ray.flags = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1; + return ray; + } + + template + __forceinline RayK getRayByOffset(const vbool& valid, size_t offset) + { + RayK ray; + ray.org.x = vfloat::loadu(valid, (float* __restrict__)((char*)org_x + offset)); + ray.org.y = vfloat::loadu(valid, (float* __restrict__)((char*)org_y + offset)); + ray.org.z = vfloat::loadu(valid, (float* __restrict__)((char*)org_z + offset)); + ray.dir.x = vfloat::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + ray.dir.y = vfloat::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + ray.dir.z = vfloat::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + ray.tfar = vfloat::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + ray.tnear() = tnear ? vfloat::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + ray.time() = time ? vfloat::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f; + ray.mask = mask ? vint::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1; + ray.id = id ? vint::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1; + ray.flags = flags ? 
vint::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1; + return ray; + } + + template + __forceinline Vec3vf getDirByOffset(const vbool& valid, size_t offset) + { + Vec3vf dir; + dir.x = vfloat::loadu(valid, (float* __restrict__)((char*)dir_x + offset)); + dir.y = vfloat::loadu(valid, (float* __restrict__)((char*)dir_y + offset)); + dir.z = vfloat::loadu(valid, (float* __restrict__)((char*)dir_z + offset)); + return dir; + } + + __forceinline void setHitByOffset(size_t offset, const RayHit& ray) + { + if (ray.geomID != RTC_INVALID_GEOMETRY_ID) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z; + *(float* __restrict__)((char*)u + offset) = ray.u; + *(float* __restrict__)((char*)v + offset) = ray.v; + *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID; + *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l]; +#endif + } + } + } + + __forceinline void setHitByOffset(size_t offset, const Ray& ray) + { + *(float* __restrict__)((char*)tfar + offset) = ray.tfar; + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayHitK& ray) + { + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + vfloat::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + + if (likely(Ng_x)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x); + 
if (likely(Ng_y)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y); + if (likely(Ng_z)) vfloat::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z); + vfloat::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u); + vfloat::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v); + vuint::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID); + vuint::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID); + + if (likely(instID[0])) { + vuint::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]); +#endif + } + } + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, size_t offset, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + vfloat::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar); + } + + __forceinline size_t getOctantByOffset(size_t offset) + { + const float dx = *(float* __restrict__)((char*)dir_x + offset); + const float dy = *(float* __restrict__)((char*)dir_y + offset); + const float dz = *(float* __restrict__)((char*)dir_z + offset); + const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0); + return octantID; + } + + __forceinline bool isValidByOffset(size_t offset) + { + const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f; + const float ffar = *(float* __restrict__)((char*)tfar + offset); + return nnear <= ffar; + } + + template + __forceinline vbool isValidByOffset(const vbool& valid, size_t offset) + { + const vfloat nnear = tnear ? 
vfloat::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f; + const vfloat ffar = vfloat::loadu(valid, (float* __restrict__)((char*)tfar + offset)); + return nnear <= ffar; + } + + template + __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) + { + RayK ray; + +#if defined(__AVX2__) + ray.org.x = vfloat::template gather<1>(valid, org_x, offset); + ray.org.y = vfloat::template gather<1>(valid, org_y, offset); + ray.org.z = vfloat::template gather<1>(valid, org_z, offset); + ray.dir.x = vfloat::template gather<1>(valid, dir_x, offset); + ray.dir.y = vfloat::template gather<1>(valid, dir_y, offset); + ray.dir.z = vfloat::template gather<1>(valid, dir_z, offset); + ray.tfar = vfloat::template gather<1>(valid, tfar, offset); + ray.tnear() = tnear ? vfloat::template gather<1>(valid, tnear, offset) : vfloat(zero); + ray.time() = time ? vfloat::template gather<1>(valid, time, offset) : vfloat(zero); + ray.mask = mask ? vint::template gather<1>(valid, (int*)mask, offset) : vint(-1); + ray.id = id ? vint::template gather<1>(valid, (int*)id, offset) : vint(-1); + ray.flags = flags ? vint::template gather<1>(valid, (int*)flags, offset) : vint(-1); +#else + ray.org = zero; + ray.tnear() = zero; + ray.dir = zero; + ray.tfar = zero; + ray.time() = zero; + ray.mask = zero; + ray.id = zero; + ray.flags = zero; + + for (size_t k = 0; k < K; k++) + { + if (likely(valid[k])) + { + const size_t ofs = offset[k]; + + ray.org.x[k] = *(float* __restrict__)((char*)org_x + ofs); + ray.org.y[k] = *(float* __restrict__)((char*)org_y + ofs); + ray.org.z[k] = *(float* __restrict__)((char*)org_z + ofs); + ray.dir.x[k] = *(float* __restrict__)((char*)dir_x + ofs); + ray.dir.y[k] = *(float* __restrict__)((char*)dir_y + ofs); + ray.dir.z[k] = *(float* __restrict__)((char*)dir_z + ofs); + ray.tfar[k] = *(float* __restrict__)((char*)tfar + ofs); + ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f; + ray.time()[k] = time ? 
*(float* __restrict__)((char*)time + ofs) : 0.0f; + ray.mask[k] = mask ? *(int* __restrict__)((char*)mask + ofs) : -1; + ray.id[k] = id ? *(int* __restrict__)((char*)id + ofs) : -1; + ray.flags[k] = flags ? *(int* __restrict__)((char*)flags + ofs) : -1; + } + } +#endif + + return ray; + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) + { + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, tfar, offset, ray.tfar); + + if (likely(Ng_x)) vfloat::template scatter<1>(valid, Ng_x, offset, ray.Ng.x); + if (likely(Ng_y)) vfloat::template scatter<1>(valid, Ng_y, offset, ray.Ng.y); + if (likely(Ng_z)) vfloat::template scatter<1>(valid, Ng_z, offset, ray.Ng.z); + vfloat::template scatter<1>(valid, u, offset, ray.u); + vfloat::template scatter<1>(valid, v, offset, ray.v); + vuint::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID); + vuint::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID); + + if (likely(instID[0])) { + vuint::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]); +#endif + } +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + + if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k]; + if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k]; + if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k]; + *(float* __restrict__)((char*)u + ofs) = ray.u[k]; + *(float* 
__restrict__)((char*)v + ofs) = ray.v[k]; + *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k]; + *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k]; + + if (likely(instID[0])) { + *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k]; +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) + *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k]; +#endif + } + } +#endif + } + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + const size_t ofs = offset[k]; + + *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k]; + } +#endif + } + } + + /* ray data */ + float* __restrict__ org_x; // x coordinate of ray origin + float* __restrict__ org_y; // y coordinate of ray origin + float* __restrict__ org_z; // z coordinate of ray origin + float* __restrict__ tnear; // start of ray segment (optional) + + float* __restrict__ dir_x; // x coordinate of ray direction + float* __restrict__ dir_y; // y coordinate of ray direction + float* __restrict__ dir_z; // z coordinate of ray direction + float* __restrict__ time; // time of this ray for motion blur (optional) + + float* __restrict__ tfar; // end of ray segment (set to hit distance) + unsigned int* __restrict__ mask; // used to mask out objects during traversal (optional) + unsigned int* __restrict__ id; // ray ID + unsigned int* __restrict__ flags; // ray flags + + /* hit data */ + float* __restrict__ Ng_x; // x coordinate of geometry normal (optional) + float* __restrict__ Ng_y; // y coordinate of geometry normal 
(optional) + float* __restrict__ Ng_z; // z coordinate of geometry normal (optional) + + float* __restrict__ u; // barycentric u coordinate of hit + float* __restrict__ v; // barycentric v coordinate of hit + + unsigned int* __restrict__ primID; // primitive ID + unsigned int* __restrict__ geomID; // geometry ID + unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional) + }; + + + struct RayStreamAOS + { + __forceinline RayStreamAOS(void* rays) + : ptr((Ray*)rays) {} + + __forceinline Ray& getRayByOffset(size_t offset) + { + return *(Ray*)((char*)ptr + offset); + } + + template + __forceinline RayK getRayByOffset(const vint& offset); + + template + __forceinline RayK getRayByOffset(const vbool& valid, const vint& offset) + { + const vint valid_offset = select(valid, offset, vintx(zero)); + return getRayByOffset(valid_offset); + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayHitK& ray) + { + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); + vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x); + vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y); + vfloat::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z); + vfloat::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u); + vfloat::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v); + vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID); + vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID); + + vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]); +#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1) + for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & 
(ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) + vuint::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]); +#endif +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + + instance_id_stack::copy(ray.instID, ray_k->instID, k); + } +#endif + } + } + + template + __forceinline void setHitByOffset(const vbool& valid_i, const vint& offset, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { +#if defined(__AVX512F__) + vfloat::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar); +#else + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]); + ray_k->tfar = ray.tfar[k]; + } +#endif + } + } + + Ray* __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat4 a1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat4 a2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat4 a3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir); + const vfloat4 b1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir); + const vfloat4 b2 = 
vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir); + const vfloat4 b3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + 
offset[2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org); + const vfloat8 ab1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org); + const vfloat8 ab2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org); + const vfloat8 ab3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org); + const vfloat8 ab4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org); + const vfloat8 ab5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org); + const vfloat8 ab6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org); + const vfloat8 ab7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org); + const vfloat8 ab8 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org); + const vfloat8 ab9 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org); + const vfloat8 ab10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org); + const vfloat8 ab11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org); + const vfloat8 ab12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org); + const vfloat8 ab13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org); + const vfloat8 ab14 = vfloat8::loadu(&((Ray*)((char*)ptr 
+ offset[14]))->org); + const vfloat8 ab15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar); + const vfloat4 c1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar); + const vfloat4 c2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar); + const vfloat4 c3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar); + const vfloat4 c4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar); + const vfloat4 c5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar); + const vfloat4 c6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar); + const vfloat4 c7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar); + const vfloat4 c8 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar); + const vfloat4 c9 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar); + const vfloat4 c10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar); + const vfloat4 c11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar); + const vfloat4 c12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar); + const vfloat4 c13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar); + const vfloat4 c14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar); + const vfloat4 c15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + + + struct RayStreamAOP + { + __forceinline RayStreamAOP(void* rays) + : ptr((Ray**)rays) {} + + __forceinline Ray& getRayByIndex(size_t index) + { + 
return *ptr[index]; + } + + template + __forceinline RayK getRayByIndex(const vint& index); + + template + __forceinline RayK getRayByIndex(const vbool& valid, const vint& index) + { + const vint valid_index = select(valid, index, vintx(zero)); + return getRayByIndex(valid_index); + } + + template + __forceinline void setHitByIndex(const vbool& valid_i, const vint& index, const RayHitK& ray) + { + vbool valid = valid_i; + valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + ray_k->Ng.x = ray.Ng.x[k]; + ray_k->Ng.y = ray.Ng.y[k]; + ray_k->Ng.z = ray.Ng.z[k]; + ray_k->u = ray.u[k]; + ray_k->v = ray.v[k]; + ray_k->primID = ray.primID[k]; + ray_k->geomID = ray.geomID[k]; + instance_id_stack::copy(ray.instID, ray_k->instID, k); + } + } + } + + template + __forceinline void setHitByIndex(const vbool& valid_i, const vint& index, const RayK& ray) + { + vbool valid = valid_i; + valid &= (ray.tfar < 0.0f); + + if (likely(any(valid))) + { + size_t valid_bits = movemask(valid); + while (valid_bits != 0) + { + const size_t k = bscf(valid_bits); + Ray* __restrict__ ray_k = ptr[index[k]]; + + ray_k->tfar = ray.tfar[k]; + } + } + } + + Ray** __restrict__ ptr; + }; + + template<> + __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index) + { + Ray4 ray; + + /* load and transpose: org.x, org.y, org.z, tnear */ + const vfloat4 a0 = vfloat4::loadu(&ptr[index[0]]->org); + const vfloat4 a1 = vfloat4::loadu(&ptr[index[1]]->org); + const vfloat4 a2 = vfloat4::loadu(&ptr[index[2]]->org); + const vfloat4 a3 = vfloat4::loadu(&ptr[index[3]]->org); + + transpose(a0,a1,a2,a3, ray.org.x, ray.org.y, ray.org.z, ray.tnear()); + + /* load and transpose: dir.x, dir.y, dir.z, time */ + const vfloat4 b0 = vfloat4::loadu(&ptr[index[0]]->dir); + const vfloat4 b1 = 
vfloat4::loadu(&ptr[index[1]]->dir); + const vfloat4 b2 = vfloat4::loadu(&ptr[index[2]]->dir); + const vfloat4 b3 = vfloat4::loadu(&ptr[index[3]]->dir); + + transpose(b0,b1,b2,b3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + + vfloat4 maskf, idf, flagsf; + transpose(c0,c1,c2,c3, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } + +#if defined(__AVX__) + template<> + __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index) + { + Ray8 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7, ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 
c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + + vfloat8 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7, ray.tfar, maskf, idf, flagsf); + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif + +#if defined(__AVX512F__) + template<> + __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index) + { + Ray16 ray; + + /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */ + const vfloat8 ab0 = vfloat8::loadu(&ptr[index[0]]->org); + const vfloat8 ab1 = vfloat8::loadu(&ptr[index[1]]->org); + const vfloat8 ab2 = vfloat8::loadu(&ptr[index[2]]->org); + const vfloat8 ab3 = vfloat8::loadu(&ptr[index[3]]->org); + const vfloat8 ab4 = vfloat8::loadu(&ptr[index[4]]->org); + const vfloat8 ab5 = vfloat8::loadu(&ptr[index[5]]->org); + const vfloat8 ab6 = vfloat8::loadu(&ptr[index[6]]->org); + const vfloat8 ab7 = vfloat8::loadu(&ptr[index[7]]->org); + const vfloat8 ab8 = vfloat8::loadu(&ptr[index[8]]->org); + const vfloat8 ab9 = vfloat8::loadu(&ptr[index[9]]->org); + const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org); + const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org); + const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org); + const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org); + const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org); + const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org); + + transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15, + ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time()); + + /* load and transpose: tfar, mask, id, flags */ + const vfloat4 c0 = vfloat4::loadu(&ptr[index[0]]->tfar); + const vfloat4 c1 = vfloat4::loadu(&ptr[index[1]]->tfar); + const vfloat4 c2 = vfloat4::loadu(&ptr[index[2]]->tfar); + const vfloat4 c3 = vfloat4::loadu(&ptr[index[3]]->tfar); + const vfloat4 c4 = vfloat4::loadu(&ptr[index[4]]->tfar); + const vfloat4 c5 = 
vfloat4::loadu(&ptr[index[5]]->tfar); + const vfloat4 c6 = vfloat4::loadu(&ptr[index[6]]->tfar); + const vfloat4 c7 = vfloat4::loadu(&ptr[index[7]]->tfar); + const vfloat4 c8 = vfloat4::loadu(&ptr[index[8]]->tfar); + const vfloat4 c9 = vfloat4::loadu(&ptr[index[9]]->tfar); + const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar); + const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar); + const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar); + const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar); + const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar); + const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar); + + vfloat16 maskf, idf, flagsf; + transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15, + ray.tfar, maskf, idf, flagsf); + + ray.mask = asInt(maskf); + ray.id = asInt(idf); + ray.flags = asInt(flagsf); + + return ray; + } +#endif +} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp new file mode 100644 index 00000000000..625fbf6d4f2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp @@ -0,0 +1,1799 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "../../include/embree3/rtcore_ray.h" + +#if defined(__aarch64__) && defined(BUILD_IOS) +#include +#endif + +using namespace embree; + +RTC_NAMESPACE_BEGIN; + + /* mutex to make API thread safe */ +#if defined(__aarch64__) && defined(BUILD_IOS) + static std::mutex g_mutex; +#else + static MutexSys g_mutex; +#endif + + RTC_API RTCDevice rtcNewDevice(const char* config) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewDevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + Device* device = new Device(config); + return (RTCDevice) device->refInc(); + RTC_CATCH_END(nullptr); + 
return (RTCDevice) nullptr; + } + + RTC_API void rtcRetainDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainDevice); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + device->refInc(); + RTC_CATCH_END(nullptr); + } + + RTC_API void rtcReleaseDevice(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseDevice); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + device->refDec(); + RTC_CATCH_END(nullptr); + } + + RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceProperty); + RTC_VERIFY_HANDLE(hdevice); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + return device->getProperty(prop); + RTC_CATCH_END(device); + return 0; + } + + RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceProperty); + const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; + if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + device->setProperty(prop,val); + RTC_CATCH_END(device); + } + + RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetDeviceError); + if (device == nullptr) return Device::getThreadErrorCode(); + else return device->getDeviceErrorCode(); + RTC_CATCH_END(device); + return RTC_ERROR_UNKNOWN; + } + + 
RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceErrorFunction); + RTC_VERIFY_HANDLE(hdevice); + device->setErrorFunction(error, userPtr); + RTC_CATCH_END(device); + } + + RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetDeviceMemoryMonitorFunction); + device->setMemoryMonitorFunction(memoryMonitor, userPtr); + RTC_CATCH_END(device); + } + + RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewSharedBuffer); + RTC_VERIFY_HANDLE(hdevice); + Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr); + return (RTCBuffer)buffer->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API void* rtcGetBufferData(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetBufferData); + RTC_VERIFY_HANDLE(hbuffer); + return buffer->data(); + RTC_CATCH_END2(buffer); + return nullptr; + } + + RTC_API void rtcRetainBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refInc(); + RTC_CATCH_END2(buffer); + } + + RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer) + { + Buffer* buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBuffer); + RTC_VERIFY_HANDLE(hbuffer); + buffer->refDec(); + RTC_CATCH_END2(buffer); + } + + RTC_API RTCScene rtcNewScene (RTCDevice 
hdevice) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewScene); + RTC_VERIFY_HANDLE(hdevice); + Scene* scene = new Scene((Device*)hdevice); + return (RTCScene) scene->refInc(); + RTC_CATCH_END((Device*)hdevice); + return nullptr; + } + + RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneDevice); + RTC_VERIFY_HANDLE(hscene); + return (RTCDevice)scene->device->refInc(); // user will own one additional device reference + RTC_CATCH_END2(scene); + return (RTCDevice)nullptr; + } + + RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneProgressMonitorFunction); + RTC_VERIFY_HANDLE(hscene); +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(g_mutex); +#else + Lock lock(g_mutex); +#endif + scene->setProgressMonitorFunction(progress,ptr); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneBuildQuality); + RTC_VERIFY_HANDLE(hscene); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) + // -- GODOT start -- + // throw std::runtime_error("invalid build quality"); + abort(); + // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + scene->setSceneFlags(flags); + RTC_CATCH_END2(scene); + } + + RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneFlags); + RTC_VERIFY_HANDLE(hscene); + return scene->getSceneFlags(); + RTC_CATCH_END2(scene); + 
return RTC_SCENE_FLAG_NONE; + } + + RTC_API void rtcCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(false); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcJoinCommitScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcJoinCommitScene); + RTC_VERIFY_HANDLE(hscene); + scene->commit(true); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + BBox3fa bounds = scene->bounds.bounds(); + bounds_o->lower_x = bounds.lower.x; + bounds_o->lower_y = bounds.lower.y; + bounds_o->lower_z = bounds.lower.z; + bounds_o->align0 = 0; + bounds_o->upper_x = bounds.upper.x; + bounds_o->upper_y = bounds.upper.y; + bounds_o->upper_z = bounds.upper.z; + bounds_o->align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetSceneBounds); + RTC_VERIFY_HANDLE(hscene); + if (bounds_o == nullptr) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); + if (scene->isModified()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + + bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x; + bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y; + bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z; + bounds_o->bounds0.align0 = 0; + bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x; + bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y; + bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z; + bounds_o->bounds0.align1 = 0; + bounds_o->bounds1.lower_x = 
scene->bounds.bounds1.lower.x; + bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y; + bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z; + bounds_o->bounds1.align0 = 0; + bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x; + bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y; + bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z; + bounds_o->bounds1.align1 = 0; + RTC_CATCH_END2(scene); + } + + RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr) + { + Scene* scene0 = (Scene*) hscene0; + Scene* scene1 = (Scene*) hscene1; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCollide); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene0); + RTC_VERIFY_HANDLE(hscene1); + if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices"); + auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false); + if (scene0->numPrimitives() != nUserPrims0 && scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep"); +#endif + scene0->intersectors.collide(scene0,scene1,callback,userPtr); + RTC_CATCH_END(scene0->device); + } + + inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + bool changed = false; + if (userContext->instStackSize > 0) + { + const AffineSpace3fa transform = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]); + + float similarityScale = 0.f; + const bool similtude = similarityTransform(transform, &similarityScale); + 
assert((similtude && similarityScale > 0) || (!similtude && similarityScale == 0.f)); + + PointQuery query_inst; + query_inst.p = xfmPoint(transform, Vec3fa(query->x, query->y, query->z)); + query_inst.radius = query->radius * similarityScale; + query_inst.time = query->time; + + PointQueryContext context_inst(scene, (PointQuery*)query, + similtude ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB, + queryFunc, userContext, similarityScale, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)&query_inst, &context_inst); + } + else + { + PointQueryContext context(scene, (PointQuery*)query, + POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr); + changed = scene->intersectors.pointQuery((PointQuery*)query, &context); + } + return changed; + } + + RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(userContext); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); + if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes"); +#endif + + return pointQuery(scene, query, userContext, queryFunc, userPtr); + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery4* query4 = (PointQuery4*)query; + PointQuery query1; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + query4->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query4->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery8* query8 = (PointQuery8*)query; + PointQuery query1; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + query8->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query8->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN) + { + Scene* 
scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcPointQuery16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(point_query.travs,cnt,cnt,cnt); + + bool changed = false; + PointQuery16* query16 = (PointQuery16*)query; + PointQuery query1; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + PointQuery query1; query16->get(i,query1); + changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL); + query16->set(i,query1); + } + return changed; + RTC_CATCH_END2_FALSE(scene); + } + + RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT3(normal.travs,1,1,1); + IntersectContext context(scene,user_context); + scene->intersectors.intersect(*rayhit,&context); +#if defined(DEBUG) + ((RayHit*)rayhit)->verifyHit(); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 
0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit4* rayhit4 = (RayHit4*)rayhit; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit4->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit4->set(i,ray1); + } +#else + scene->intersectors.intersect4(valid,*rayhit,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)rayhit) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit8* rayhit8 = (RayHit8*) rayhit; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit8->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.intersect8(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void 
rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)rayhit) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(normal.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + RayHit16* rayhit16 = (RayHit16*) rayhit; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + RayHit ray1; rayhit16->get(i,ray1); + scene->intersectors.intersect((RTCRayHit&)ray1,&context); + rayhit16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.intersect16(valid,*rayhit,&context); + else + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1M); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) + 
scene->intersectors.intersect(*rayhit,&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersect1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for single rays */ + if (likely(M == 1)) { + if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) + scene->intersectors.intersect(*rn[0],&context); + } + + /* codepath for streams */ + else { + scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNM); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N*M,N*M,N*M); + IntersectContext context(scene,user_context); + + /* code path for single ray streams */ + if (likely(N == 1)) + { + /* fast code path for 
streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar)) + scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context); + } + /* normal codepath for single ray streams */ + else { + scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcIntersectNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes"); + if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes"); + if (((size_t)rayhit->ray.tfar ) & 0x03 ) 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes"); + if (((size_t)rayhit->ray.time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes"); + if (((size_t)rayhit->ray.mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes"); + if (((size_t)rayhit->hit.Ng_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes"); + if (((size_t)rayhit->hit.u ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes"); + if (((size_t)rayhit->hit.v ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes"); + if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes"); + if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes"); +#endif + STAT3(normal.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1); + STAT3(shadow.travs,1,1,1); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene 
not committed"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + IntersectContext context(scene,user_context); + scene->intersectors.occluded(*ray,&context); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded4); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes"); + if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray4* ray4 = (Ray4*) ray; + for (size_t i=0; i<4; i++) { + if (!valid[i]) continue; + Ray ray1; ray4->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray4->set(i,ray1); + } +#else + scene->intersectors.occluded4(valid,*ray,&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded8); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes"); + if (((size_t)ray) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + 
IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray8* ray8 = (Ray8*) ray; + for (size_t i=0; i<8; i++) { + if (!valid[i]) continue; + Ray ray1; ray8->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray8->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector8)) + scene->intersectors.occluded8(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded16); + +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes"); + if (((size_t)ray) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes"); +#endif + STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;); + STAT3(shadow.travs,cnt,cnt,cnt); + + IntersectContext context(scene,user_context); +#if !defined(EMBREE_RAY_PACKETS) + Ray16* ray16 = (Ray16*) ray; + for (size_t i=0; i<16; i++) { + if (!valid[i]) continue; + Ray ray1; ray16->get(i,ray1); + scene->intersectors.occluded((RTCRay&)ray1,&context); + ray16->set(i,ray1); + } +#else + if (likely(scene->intersectors.intersector16)) + scene->intersectors.occluded16(valid,*ray,&context); + else + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context); +#endif + + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1M); + +#if defined 
(EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray->tnear <= ray->tfar)) + scene->intersectors.occluded (*ray,&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccluded1Mp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,M,M,M); + IntersectContext context(scene,user_context); + + /* fast codepath for streams of size 1 */ + if (likely(M == 1)) { + if (likely(ray[0]->tnear <= ray[0]->tfar)) + scene->intersectors.occluded (*ray[0],&context); + } + /* codepath for normal streams */ + else { + scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNM); + +#if defined 
(EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small"); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N*M,N*N,N*N); + IntersectContext context(scene,user_context); + + /* codepath for single rays */ + if (likely(N == 1)) + { + /* fast path for streams of size 1 */ + if (likely(M == 1)) { + if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar)) + scene->intersectors.occluded (*(RTCRay*)ray,&context); + } + /* codepath for normal ray streams */ + else { + scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context); + } + } + /* code path for ray packet streams */ + else { + scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context); + } +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcOccludedNp); + +#if defined (EMBREE_RAY_PACKETS) +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); + if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes"); + if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes"); + if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes"); + if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->dir_y ) & 0x03 ) 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes"); + if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes"); + if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes"); + if (((size_t)ray->tfar ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes"); + if (((size_t)ray->time ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes"); + if (((size_t)ray->mask ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes"); +#endif + STAT3(shadow.travs,N,N,N); + IntersectContext context(scene,user_context); + scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported"); +#endif + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainScene); + RTC_VERIFY_HANDLE(hscene); + scene->refInc(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcReleaseScene (RTCScene hscene) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseScene); + RTC_VERIFY_HANDLE(hscene); + scene->refDec(); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryInstancedScene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hscene); + geometry->setInstancedScene(scene); + RTC_CATCH_END2(geometry); + } + + AffineSpace3fa loadTransform(RTCFormat format, const float* xfm) + { + AffineSpace3fa space = one; + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 4], xfm[ 8]), + Vec3fa(xfm[ 1], xfm[ 5], xfm[ 9]), + Vec3fa(xfm[ 2], xfm[ 6], xfm[10]), + 
Vec3fa(xfm[ 3], xfm[ 7], xfm[11])); + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 3], xfm[ 4], xfm[ 5]), + Vec3fa(xfm[ 6], xfm[ 7], xfm[ 8]), + Vec3fa(xfm[ 9], xfm[10], xfm[11])); + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + space = AffineSpace3fa(Vec3fa(xfm[ 0], xfm[ 1], xfm[ 2]), + Vec3fa(xfm[ 4], xfm[ 5], xfm[ 6]), + Vec3fa(xfm[ 8], xfm[ 9], xfm[10]), + Vec3fa(xfm[12], xfm[13], xfm[14])); + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + return space; + } + + void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm) + { + switch (format) + { + case RTC_FORMAT_FLOAT3X4_ROW_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vy.x; xfm[ 2] = space.l.vz.x; xfm[ 3] = space.p.x; + xfm[ 4] = space.l.vx.y; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vz.y; xfm[ 7] = space.p.y; + xfm[ 8] = space.l.vx.z; xfm[ 9] = space.l.vy.z; xfm[10] = space.l.vz.z; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; + xfm[ 3] = space.l.vy.x; xfm[ 4] = space.l.vy.y; xfm[ 5] = space.l.vy.z; + xfm[ 6] = space.l.vz.x; xfm[ 7] = space.l.vz.y; xfm[ 8] = space.l.vz.z; + xfm[ 9] = space.p.x; xfm[10] = space.p.y; xfm[11] = space.p.z; + break; + + case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR: + xfm[ 0] = space.l.vx.x; xfm[ 1] = space.l.vx.y; xfm[ 2] = space.l.vx.z; xfm[ 3] = 0.f; + xfm[ 4] = space.l.vy.x; xfm[ 5] = space.l.vy.y; xfm[ 6] = space.l.vy.z; xfm[ 7] = 0.f; + xfm[ 8] = space.l.vz.x; xfm[ 9] = space.l.vz.y; xfm[10] = space.l.vz.z; xfm[11] = 0.f; + xfm[12] = space.p.x; xfm[13] = space.p.y; xfm[14] = space.p.z; xfm[15] = 1.f; + break; + + default: + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format"); + break; + } + } + + RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, 
const void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransform); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(xfm); + const AffineSpace3fa transform = loadTransform(format, (const float*)xfm); + geometry->setTransform(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTransformQuaternion); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(qd); + + AffineSpace3fx transform; + transform.l.vx.x = qd->scale_x; + transform.l.vy.y = qd->scale_y; + transform.l.vz.z = qd->scale_z; + transform.l.vy.x = qd->skew_xy; + transform.l.vz.x = qd->skew_xz; + transform.l.vz.y = qd->skew_yz; + transform.l.vx.y = qd->translation_x; + transform.l.vx.z = qd->translation_y; + transform.l.vy.z = qd->translation_z; + transform.p.x = qd->shift_x; + transform.p.y = qd->shift_y; + transform.p.z = qd->shift_z; + + // normalize quaternion + Quaternion3f q(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k); + q = normalize(q); + transform.l.vx.w = q.i; + transform.l.vy.w = q.j; + transform.l.vz.w = q.k; + transform.p.w = q.r; + + geometry->setQuaternionDecomposition(transform, timeStep); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryTransform); + const AffineSpace3fa transform = geometry->getTransform(time); + storeTransform(transform, format, (float*)xfm); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + IntersectFunctionNArguments* args = 
(IntersectFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) + { + OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; + args->report(args,filter_args); + } + + RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) + { + Device* device = (Device*) hdevice; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewGeometry); + RTC_VERIFY_HANDLE(hdevice); + + switch (type) + { + case RTC_GEOMETRY_TYPE_TRIANGLE: + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + createTriangleMeshTy createTriangleMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh); + Geometry* geom = createTriangleMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_QUAD: + { +#if defined(EMBREE_GEOMETRY_QUAD) + createQuadMeshTy createQuadMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh); + Geometry* geom = createQuadMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + case RTC_GEOMETRY_TYPE_DISC_POINT: + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + { +#if defined(EMBREE_GEOMETRY_POINT) + createPointsTy createPoints = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints); + + Geometry *geom; + switch(type) { + case RTC_GEOMETRY_TYPE_SPHERE_POINT: + geom = createPoints(device, Geometry::GTY_SPHERE_POINT); + break; + case RTC_GEOMETRY_TYPE_DISC_POINT: + geom = createPoints(device, Geometry::GTY_DISC_POINT); + break; + case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT: + geom = 
createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT); + break; + default: + geom = nullptr; + break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE: + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE: + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE: + { +#if defined(EMBREE_GEOMETRY_CURVE) + createLineSegmentsTy createLineSegments = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments); + createCurvesTy createCurves = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves); + + Geometry* geom; + switch (type) { + case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break; + //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break; + case 
RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break; + + case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break; + case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break; + default: geom = nullptr; break; + } + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_SUBDIVISION: + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + createSubdivMeshTy createSubdivMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh); + //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason? 
+ Geometry* geom = createSubdivMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_USER: + { +#if defined(EMBREE_GEOMETRY_USER) + createUserGeometryTy createUserGeometry = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry); + Geometry* geom = createUserGeometry(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_INSTANCE: + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + createInstanceTy createInstance = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance); + Geometry* geom = createInstance(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported"); +#endif + } + + case RTC_GEOMETRY_TYPE_GRID: + { +#if defined(EMBREE_GEOMETRY_GRID) + createGridMeshTy createGridMesh = nullptr; + SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh); + Geometry* geom = createGridMesh(device); + return (RTCGeometry) geom->refInc(); +#else + throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported"); +#endif + } + + default: + throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type"); + } + + RTC_CATCH_END(device); + return nullptr; + } + + RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserPrimitiveCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (unlikely(geometry->getType() != Geometry::GTY_USER_GEOMETRY)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries"); + + geometry->setNumPrimitives(userPrimitiveCount); 
+ RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeStepCount); + RTC_VERIFY_HANDLE(hgeometry); + + if (timeStepCount > RTC_MAX_TIME_STEP_COUNT) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range"); + + geometry->setNumTimeSteps(timeStepCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime) + { + Ref geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTimeRange); + RTC_VERIFY_HANDLE(hgeometry); + + if (startTime > endTime) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime"); + + geometry->setTimeRange(BBox1f(startTime,endTime)); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTopologyCount); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTopologyCount(N); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBuildQuality); + RTC_VERIFY_HANDLE(hgeometry); + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) + // -- GODOT start -- + // throw std::runtime_error("invalid build 
quality"); + abort(); + // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMaxRadiusScale); + RTC_VERIFY_HANDLE(hgeometry); +#if RTC_MIN_WIDTH + if (maxRadiusScale < 1.0f) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1"); + geometry->setMaxRadiusScale(maxRadiusScale); +#else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled"); +#endif + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryMask); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setMask(mask); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometrySubdivisionMode); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setSubdivisionMode(topologyID,mode); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryVertexAttributeTopology); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setVertexAttributeTopology(vertexAttributeID, topologyID); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + Ref buffer = (Buffer*)hbuffer; + RTC_CATCH_BEGIN; + 
RTC_TRACE(rtcSetGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_HANDLE(hbuffer); + + if (geometry->device != buffer->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + geometry->setBuffer(type, slot, format, buffer, byteOffset, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetSharedGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + Ref buffer = new Buffer(geometry->device, itemCount*byteStride, (char*)ptr + byteOffset); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetNewGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + + if (itemCount > 0xFFFFFFFFu) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large"); + + /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */ + size_t bytes = itemCount*byteStride; + if (type == RTC_BUFFER_TYPE_VERTEX || type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + bytes += (16 - (byteStride%16))%16; + + Ref buffer = new Buffer(geometry->device, bytes); + geometry->setBuffer(type, slot, format, buffer, 0, byteStride, (unsigned int)itemCount); + return buffer->data(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API 
void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryBufferData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getBuffer(type, slot); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcEnableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->enable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcUpdateGeometryBuffer); + RTC_VERIFY_HANDLE(hgeometry); + geometry->updateBuffer(type, slot); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDisableGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->disable(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryTessellationRate); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setTessellationRate(tessellationRate); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setUserData(ptr); + RTC_CATCH_END2(geometry); + } + + RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; // no ref counting here! 
+ RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryUserData); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->getUserData(); + RTC_CATCH_END2(geometry); + return nullptr; + } + + RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryBoundsFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setBoundsFunction(bounds,userPtr); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryDisplacementFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setDisplacementFunction(displacement); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectFunctionN(intersect); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryPointQueryFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setPointQueryFunction(pointQuery); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryFirstHalfEdge); + return geometry->getFirstHalfEdge(faceID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + 
RTC_TRACE(rtcGetGeometryFace); + return geometry->getFace(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryNextHalfEdge); + return geometry->getNextHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryPreviousHalfEdge); + return geometry->getPreviousHalfEdge(edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryOppositeHalfEdge); + return geometry->getOppositeHalfEdge(topologyID,edgeID); + RTC_CATCH_END2(geometry); + return -1; + } + + RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetOccludedFunctionN); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setOccludedFunctionN(occluded); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryIntersectFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + geometry->setIntersectionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcSetGeometryOccludedFilterFunction); + RTC_VERIFY_HANDLE(hgeometry); + 
geometry->setOcclusionFilterFunctionN(filter); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolate); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolate(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args) + { + Geometry* geometry = (Geometry*) args->geometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcInterpolateN); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(args->geometry); +#endif + geometry->interpolateN(args); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcCommitGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcCommitGeometry); + RTC_VERIFY_HANDLE(hgeometry); + return geometry->commit(); + RTC_CATCH_END2(geometry); + } + + RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + return scene->bind(RTC_INVALID_GEOMETRY_ID,geometry); + RTC_CATCH_END2(scene); + return -1; + } + + RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcAttachGeometryByID); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_HANDLE(hgeometry); + RTC_VERIFY_GEOMID(geomID); + if (scene->device != geometry->device) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices"); + scene->bind(geomID,geometry); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcDetachGeometry 
(RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcDetachGeometry); + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); + scene->detachGeometry(geomID); + RTC_CATCH_END2(scene); + } + + RTC_API void rtcRetainGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refInc(); + RTC_CATCH_END2(geometry); + } + + RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry) + { + Geometry* geometry = (Geometry*) hgeometry; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseGeometry); + RTC_VERIFY_HANDLE(hgeometry); + geometry->refDec(); + RTC_CATCH_END2(geometry); + } + + RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometry); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + return (RTCGeometry) scene->get(geomID); + RTC_CATCH_END2(scene); + return nullptr; + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h new file mode 100644 index 00000000000..4b070e122ba --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h @@ -0,0 +1,142 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../../include/embree3/rtcore.h" +RTC_NAMESPACE_USE + +namespace embree +{ + /*! 
decoding of intersection flags */ + __forceinline bool isCoherent (RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_COHERENT; } + __forceinline bool isIncoherent(RTCIntersectContextFlags flags) { return (flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT) == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT; } + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8) +# define USE_TASK_ARENA 1 +#else +# define USE_TASK_ARENA 0 +#endif + +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9 +# define TASKING_TBB_USE_TASK_ISOLATION 1 +#else +# define TASKING_TBB_USE_TASK_ISOLATION 0 +#endif + +/*! Macros used in the rtcore API implementation */ +// -- GODOT start -- +// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN + +// #define RTC_CATCH_END(device) \ +// } catch (std::bad_alloc&) { \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END(device) + +// #define RTC_CATCH_END2(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// } catch (...) { \ +// Device* device = scene ? 
scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// } +#define RTC_CATCH_END2(scene) + +// #define RTC_CATCH_END2_FALSE(scene) \ +// } catch (std::bad_alloc&) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +// return false; \ +// } catch (rtcore_error& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,e.error,e.what()); \ +// return false; \ +// } catch (std::exception& e) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +// return false; \ +// } catch (...) { \ +// Device* device = scene ? scene->device : nullptr; \ +// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +// return false; \ +// } +#define RTC_CATCH_END2_FALSE(scene) return false; +// -- GODOT end -- + +#define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_GEOMID(id) \ + if (id == RTC_INVALID_GEOMETRY_ID) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_UPPER(id,upper) \ + if (id > upper) { \ + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \ + } + +#define RTC_VERIFY_RANGE(id,lower,upper) \ + if (id < lower || id > upper) \ + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds"); + +#if 0 // enable to debug print all API calls +#define RTC_TRACE(x) std::cout << #x << std::endl; +#else +#define RTC_TRACE(x) +#endif + +// -- GODOT begin -- +// /*! 
used to throw embree API errors */ +// struct rtcore_error : public std::exception +// { +// __forceinline rtcore_error(RTCError error, const std::string& str) +// : error(error), str(str) {} +// +// ~rtcore_error() throw() {} +// +// const char* what () const throw () { +// return str.c_str(); +// } +// +// RTCError error; +// std::string str; +// }; +// -- GODOT end -- + +#if defined(DEBUG) // only report file and line in debug mode + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ + printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); + // -- GODOT end -- +#else + // -- GODOT begin -- + // #define throw_RTCError(error,str) \ + // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ + abort(); + // -- GODOT end -- +#endif + +#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ + (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) +} diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp new file mode 100644 index 00000000000..6bb96bba077 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp @@ -0,0 +1,442 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_EXPORT_API + +#include "default.h" +#include "device.h" +#include "scene.h" +#include "context.h" +#include "alloc.h" + +#include "../builders/bvh_builder_sah.h" +#include "../builders/bvh_builder_morton.h" + +namespace embree +{ + namespace isa // FIXME: support more ISAs for builders + { + struct BVH : public RefCount + { + BVH (Device* device) + : device(device), allocator(device,true), morton_src(device,0), morton_tmp(device,0) + { + device->refInc(); + } + + ~BVH() { + device->refDec(); + } + + public: + Device* 
device; + FastAllocator allocator; + mvector morton_src; + mvector morton_tmp; + }; + + void* rtcBuildBVHMorton(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims_i = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic progress(0); + + /* initialize temporary arrays for morton builder */ + PrimRef* prims = (PrimRef*) prims_i; + mvector& morton_src = bvh->morton_src; + mvector& morton_tmp = bvh->morton_tmp; + morton_src.resize(primitiveCount); + morton_tmp.resize(primitiveCount); + + /* compute centroid bounds */ + const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range& r) -> BBox3fa { + + BBox3fa bounds(empty); + for (size_t i=r.begin(); i& r) { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]); + for (size_t i=r.begin(); i root = BVHBuilderMorton::build>( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that allocates BVH nodes */ + [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* { + return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + }, + + /* lambda function that sets bounds */ + [&] (void* node, const std::pair* children, size_t N) -> std::pair + { + BBox3fa bounds = empty; + void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i& current, const 
FastAllocator::CachedAllocator& alloc) -> std::pair + { + RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; + BBox3fa bounds = empty; + for (size_t i=0;i BBox3fa { + return prims[morton.index].bounds(); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + morton_src.data(),morton_tmp.data(),primitiveCount, + *arguments); + + bvh->allocator.cleanup(); + return root.first; + } + + void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic progress(0); + + /* calculate priminfo */ + auto computeBounds = [&](const range& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t j=r.begin(); j( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i void* { + setNodeChildren(node,children, (unsigned int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH 
leaves */ + [&](const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims,pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + + static __forceinline const std::pair mergePair(const std::pair& a, const std::pair& b) { + CentGeomBBox3fa centBounds = CentGeomBBox3fa::merge2(a.first,b.first); + unsigned int maxGeomID = max(a.second,b.second); + return std::pair(centBounds,maxGeomID); + } + + void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTCBuildPrimitive* prims = arguments->primitives; + size_t primitiveCount = arguments->primitiveCount; + RTCCreateNodeFunction createNode = arguments->createNode; + RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren; + RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds; + RTCCreateLeafFunction createLeaf = arguments->createLeaf; + RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive; + RTCProgressMonitorFunction buildProgress = arguments->buildProgress; + void* userPtr = arguments->userPtr; + + std::atomic progress(0); + + /* calculate priminfo */ + + auto computeBounds = [&](const range& r) -> std::pair + { + CentGeomBBox3fa bounds(empty); + unsigned maxGeomID = 0; + for (size_t j=r.begin(); j(bounds,maxGeomID); + }; + + + const std::pair pair = + parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair(CentGeomBBox3fa(empty),0), computeBounds, mergePair); + + CentGeomBBox3fa bounds = pair.first; + const unsigned int maxGeomID = pair.second; + + if (unlikely(maxGeomID >= 
((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)))) + { + /* fallback code for max geomID larger than threshold */ + return rtcBuildBVHBinnedSAH(arguments); + } + + const PrimInfo pinfo(0,primitiveCount,bounds); + + /* function that splits a build primitive */ + struct Splitter + { + Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr) + : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {} + + __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const + { + prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + left_o.geomIDref() = geomID; left_o.primIDref() = primID; + right_o.geomIDref() = geomID; right_o.primIDref() = primID; + } + + __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const + { + PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); + splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr); + } + + RTCSplitPrimitiveFunction splitPrimitive; + unsigned geomID; + unsigned primID; + void* userPtr; + }; + + /* build BVH */ + void* root = BVHBuilderBinnedFastSpatialSAH::build( + + /* thread local allocator for fast allocations */ + [&] () -> FastAllocator::CachedAllocator { + return bvh->allocator.getCachedAllocator(); + }, + + /* lambda function that creates BVH nodes */ + [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void* + { + void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr); + const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR]; + for (size_t i=0; i void* { + setNodeChildren(node,children, (unsigned 
int)N,userPtr); + return node; + }, + + /* lambda function that creates BVH leaves */ + [&] (const PrimRef* prims, const range& range, const FastAllocator::CachedAllocator& alloc) -> void* { + return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr); + }, + + /* returns the splitter */ + [&] ( const PrimRef& prim ) -> Splitter { + return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr); + }, + + /* progress monitor function */ + [&] (size_t dn) { + if (!buildProgress) return true; + const size_t n = progress.fetch_add(dn)+dn; + const double f = std::min(1.0,double(n)/double(primitiveCount)); + return buildProgress(userPtr,f); + }, + + (PrimRef*)prims, + arguments->primitiveArrayCapacity, + pinfo,*arguments); + + bvh->allocator.cleanup(); + return root; + } + } +} + +using namespace embree; +using namespace embree::isa; + +RTC_NAMESPACE_BEGIN + + RTC_API RTCBVH rtcNewBVH(RTCDevice device) + { + RTC_CATCH_BEGIN; + RTC_TRACE(rtcNewAllocator); + RTC_VERIFY_HANDLE(device); + BVH* bvh = new BVH((Device*)device); + return (RTCBVH) bvh->refInc(); + RTC_CATCH_END((Device*)device); + return nullptr; + } + + RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments) + { + BVH* bvh = (BVH*) arguments->bvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcBuildBVH); + RTC_VERIFY_HANDLE(bvh); + RTC_VERIFY_HANDLE(arguments); + RTC_VERIFY_HANDLE(arguments->createNode); + RTC_VERIFY_HANDLE(arguments->setNodeChildren); + RTC_VERIFY_HANDLE(arguments->setNodeBounds); + RTC_VERIFY_HANDLE(arguments->createLeaf); + + if (arguments->primitiveArrayCapacity < arguments->primitiveCount) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount") + + /* initialize the allocator */ + bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); + bvh->allocator.reset(); + + /* switch between differnet builders based on quality level */ + if (arguments->buildQuality == 
RTC_BUILD_QUALITY_LOW) + return rtcBuildBVHMorton(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) + return rtcBuildBVHBinnedSAH(arguments); + else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) { + if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount) + return rtcBuildBVHBinnedSAH(arguments); + else + return rtcBuildBVHSpatialSAH(arguments); + } + else + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality"); + + /* if we are in dynamic mode, then do not clear temporary data */ + if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) + { + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + } + + RTC_CATCH_END(bvh->device); + return nullptr; + } + + RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) + { + FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcThreadLocalAlloc); + return alloc->malloc0(bytes,align); + RTC_CATCH_END(alloc->alloc->getDevice()); + return nullptr; + } + + RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcStaticBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->morton_src.clear(); + bvh->morton_tmp.clear(); + RTC_CATCH_END(bvh->device); + } + + RTC_API void rtcRetainBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcRetainBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refInc(); + RTC_CATCH_END(device); + } + + RTC_API void rtcReleaseBVH(RTCBVH hbvh) + { + BVH* bvh = (BVH*) hbvh; + Device* device = bvh ? 
bvh->device : nullptr; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcReleaseBVH); + RTC_VERIFY_HANDLE(hbvh); + bvh->refDec(); + RTC_CATCH_END(device); + } + +RTC_NAMESPACE_END diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp new file mode 100644 index 00000000000..1e23aeb415d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp @@ -0,0 +1,976 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene.h" + +#include "../bvh/bvh4_factory.h" +#include "../bvh/bvh8_factory.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + /* error raising rtcIntersect and rtcOccluded functions */ + void missing_rtcCommit() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } + void invalid_rtcIntersect1() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); } + void invalid_rtcIntersect4() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); } + void invalid_rtcIntersect8() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); } + void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); } + void invalid_rtcIntersectN() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); } + + Scene::Scene (Device* device) + : device(device), + flags_modified(true), enabled_geometry_types(0), + scene_flags(RTC_SCENE_FLAG_NONE), + quality_flags(RTC_BUILD_QUALITY_MEDIUM), + is_build(false), modified(true), + progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0) + { + device->refInc(); + + intersectors = Accel::Intersectors(missing_rtcCommit); + + /* one can overwrite flags through device for debugging */ + if (device->quality_flags != -1) + quality_flags = 
(RTCBuildQuality) device->quality_flags; + if (device->scene_flags != -1) + scene_flags = (RTCSceneFlags) device->scene_flags; + } + + Scene::~Scene() noexcept + { + device->refDec(); + } + + void Scene::printStatistics() + { + /* calculate maximum number of time segments */ + unsigned max_time_steps = 0; + for (size_t i=0; inumTimeSteps); + } + + /* initialize vectors*/ + std::vector statistics[Geometry::GTY_END]; + for (size_t i=0; igetType(); + assert(tynumTimeSegments(); + assert((unsigned int)timesegments < max_time_steps); + statistics[ty][timesegments] += get(i)->size(); + } + + /* print statistics */ + std::cout << std::setw(23) << "segments" << ": "; + for (size_t t=0; ttri_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + + break; + case /*0b10*/ 2: 
accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->tri_accel == "bvh4.triangle4") accels_add(device->bvh4_factory->BVH4Triangle4 (this)); + else if (device->tri_accel == "bvh4.triangle4v") 
accels_add(device->bvh4_factory->BVH4Triangle4v(this)); + else if (device->tri_accel == "bvh4.triangle4i") accels_add(device->bvh4_factory->BVH4Triangle4i(this)); + else if (device->tri_accel == "qbvh4.triangle4i") accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel == "bvh8.triangle4") accels_add(device->bvh8_factory->BVH8Triangle4 (this)); + else if (device->tri_accel == "bvh8.triangle4v") accels_add(device->bvh8_factory->BVH8Triangle4v(this)); + else if (device->tri_accel == "bvh8.triangle4i") accels_add(device->bvh8_factory->BVH8Triangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4i") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this)); + else if (device->tri_accel == "qbvh8.triangle4") accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); +#endif + } + + void Scene::createTriangleMBAccel() + { +#if defined(EMBREE_GEOMETRY_TRIANGLE) + if (device->tri_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case 
/*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this)); + else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); +#endif + } + + void Scene::createQuadAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel == "default") + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) + { + /* static */ + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (quality_flags == RTC_BUILD_QUALITY_HIGH) + 
accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST)); + else + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + } + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else /* dynamic */ + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: 
accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + } + else if (device->quad_accel == "bvh4.quad4v") accels_add(device->bvh4_factory->BVH4Quad4v(this)); + else if (device->quad_accel == "bvh4.quad4i") accels_add(device->bvh4_factory->BVH4Quad4i(this)); + else if (device->quad_accel == "qbvh4.quad4i") accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel == "bvh8.quad4v") accels_add(device->bvh8_factory->BVH8Quad4v(this)); + else if (device->quad_accel == "bvh8.quad4i") accels_add(device->bvh8_factory->BVH8Quad4i(this)); + else if (device->quad_accel == "qbvh8.quad4i") accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); +#endif + } + + void Scene::createQuadMBAccel() + { +#if defined(EMBREE_GEOMETRY_QUAD) + if (device->quad_accel_mb == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); + switch (mode) { + case /*0b00*/ 0: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + else +#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); + break; + + case /*0b01*/ 1: +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX()) + accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + else 
+#endif + accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); + break; + + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST )); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb); +#endif + } + + void Scene::createHairAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel == "default") + { + int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + else +#endif + { + switch (mode) { + case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); 
break; + case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break; + case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break; + } + } + } + else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel); +#endif + } + + void Scene::createHairMBAccel() + { +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + if (device->hair_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower + { + if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + } + else +#endif + { + if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST)); + else accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + } + } + else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") 
accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST)); + +#if defined (EMBREE_TARGET_SIMD8) + else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); + else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb); +#endif + } + + void Scene::createSubdivAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + } + else if (device->subdiv_accel == "bvh4.grid.eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else if (device->subdiv_accel == "bvh4.subdivpatch1eager" ) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel); +#endif + } + + void Scene::createSubdivMBAccel() + { +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + if (device->subdiv_accel_mb == "default") { + accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this)); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb); +#endif + } + + void Scene::createUserGeometryAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + 
accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel); +#endif + } + + void Scene::createUserGeometryMBAccel() + { +#if defined(EMBREE_GEOMETRY_USER) + if (device->object_accel_mb == "default" ) { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); + else +#endif + accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); + } + else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb); +#endif + } + + void Scene::createInstanceAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC)); + } else { + 
accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, false)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, false)); + } + //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createInstanceExpensiveAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + // if (device->object_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + else +#endif + { + if (quality_flags != RTC_BUILD_QUALITY_LOW) { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC)); + } else { + accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC)); + } + } + } + // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel); +#endif + } + + void Scene::createInstanceExpensiveMBAccel() + { +#if defined(EMBREE_GEOMETRY_INSTANCE) + //if (device->instance_accel_mb == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); + else +#endif + accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); + } + //else 
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb); +#endif + } + + void Scene::createGridAccel() + { + BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel == "default") + { +#if defined (EMBREE_TARGET_SIMD8) + if (device->canUseAVX() && !isCompactAccel()) + { + accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + else +#endif + { + accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); + } + } + else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#if defined (EMBREE_TARGET_SIMD8) + else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); +#endif + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel); +#endif + + } + + void Scene::createGridMBAccel() + { +#if defined(EMBREE_GEOMETRY_GRID) + if (device->grid_accel_mb == "default") + { + accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC)); + } + else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this)); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel); +#endif + + } + + void Scene::clear() { + } + + unsigned Scene::bind(unsigned geomID, Ref geometry) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else + Lock lock(geometriesMutex); +#endif + if (geomID == RTC_INVALID_GEOMETRY_ID) { + geomID = id_pool.allocate(); + if (geomID == RTC_INVALID_GEOMETRY_ID) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene"); + } + else + { + if (!id_pool.add(geomID)) + 
throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided"); + } + if (geomID >= geometries.size()) { + geometries.resize(geomID+1); + vertices.resize(geomID+1); + geometryModCounters_.resize(geomID+1); + } + geometries[geomID] = geometry; + geometryModCounters_[geomID] = 0; + if (geometry->isEnabled()) { + setModified (); + } + return geomID; + } + + void Scene::detachGeometry(size_t geomID) + { +#if defined(__aarch64__) && defined(BUILD_IOS) + std::scoped_lock lock(geometriesMutex); +#else + Lock lock(geometriesMutex); +#endif + + if (geomID >= geometries.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID"); + + Ref& geometry = geometries[geomID]; + if (geometry == null) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); + + if (geometry->isEnabled()) { + setModified (); + } + accels_deleteGeometry(unsigned(geomID)); + id_pool.deallocate((unsigned)geomID); + geometries[geomID] = null; + vertices[geomID] = nullptr; + geometryModCounters_[geomID] = 0; + } + + void Scene::updateInterface() + { + is_build = true; + } + + void Scene::commit_task () + { + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* print scene statistics */ + if (device->verbosity(2)) + printStatistics(); + + progress_monitor_counter = 0; + + /* gather scene stats and call preCommit function of each geometry */ + this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (), + [this](const range& r)->GeometryCounts + { + GeometryCounts c; + for (auto i=r.begin(); iisEnabled()) + { + geometries[i]->preCommit(); + geometries[i]->addElementsToCount (c); + c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions(); + } + } + return c; + }, + std::plus() + ); + + /* select acceleration structures to build */ + unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask(); + if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) + { + accels_init(); + + /* we need to make 
all geometries modified, otherwise two level builder will + not rebuild currently not modified geometries */ + parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) { + geometryModCounters_[i] = 0; + }); + + if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel(); + if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel(); + if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel(); + if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel(); + if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel(); + if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel(); + if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel(); + if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel(); + if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel(); + if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel(); + if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel(); + + flags_modified = false; + enabled_geometry_types = new_enabled_geometry_types; + } + + /* select fast code path if no filter function is present */ + accels_select(hasFilterFunction()); + + /* build all hierarchies of this scene */ + accels_build(); + + /* make static geometry immutable */ + if (!isDynamicAccel()) { + accels_immutable(); + flags_modified = true; // in non-dynamic mode we have to re-create accels + } + + /* call postCommit function of each geometry */ + parallel_for(geometries.size(), [&] ( const 
size_t i ) { + if (geometries[i] && geometries[i]->isEnabled()) { + geometries[i]->postCommit(); + vertices[i] = geometries[i]->getCompactVertexArray(); + geometryModCounters_[i] = geometries[i]->getModCounter(); + } + }); + + updateInterface(); + + if (device->verbosity(2)) { + std::cout << "created scene intersector" << std::endl; + accels_print(2); + std::cout << "selected scene intersector" << std::endl; + intersectors.print(2); + } + + setModified(false); + } + + void Scene::setBuildQuality(RTCBuildQuality quality_flags_i) + { + if (quality_flags == quality_flags_i) return; + quality_flags = quality_flags_i; + flags_modified = true; + } + + RTCBuildQuality Scene::getBuildQuality() const { + return quality_flags; + } + + void Scene::setSceneFlags(RTCSceneFlags scene_flags_i) + { + if (scene_flags == scene_flags_i) return; + scene_flags = scene_flags_i; + flags_modified = true; + } + + RTCSceneFlags Scene::getSceneFlags() const { + return scene_flags; + } + +#if defined(TASKING_INTERNAL) + + void Scene::commit (bool join) + { + Lock buildLock(buildMutex,false); + + /* allocates own taskscheduler for each build */ + Ref scheduler = nullptr; + { + Lock lock(schedulerMutex); + scheduler = this->scheduler; + if (scheduler == null) { + buildLock.lock(); + this->scheduler = scheduler = new TaskScheduler; + } + } + + /* worker threads join build */ + if (!buildLock.isLocked()) + { + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation"); + + scheduler->join(); + return; + } + + /* initiate build */ + // -- GODOT start -- + // try { + scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); + // } + // catch (...) 
{ + // accels_clear(); + // updateInterface(); + // Lock lock(schedulerMutex); + // this->scheduler = nullptr; + // throw; + // } + // -- GODOT end -- + } + +#endif + +#if defined(TASKING_TBB) || defined(TASKING_GCD) + + void Scene::commit (bool join) + { +#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version"); +#endif + + /* try to obtain build lock */ + Lock lock(buildMutex,buildMutex.try_lock()); + + /* join hierarchy build */ + if (!lock.isLocked()) + { +#if !TASKING_TBB_USE_TASK_ISOLATION + if (!join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version"); +#endif + + do { + +#if defined(TASKING_GCD) + // Do Nothing +#else +#if USE_TASK_ARENA + if (join) { + device->arena->execute([&]{ group.wait(); }); + } + else +#endif + { + group.wait(); + } +#endif + + pause_cpu(); + yield(); + + } while (!buildMutex.try_lock()); + + buildMutex.unlock(); + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { +#if defined(TASKING_TBB) +#if TBB_INTERFACE_VERSION_MAJOR < 8 + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits); +#else + tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings ); +#endif + //ctx.set_priority(tbb::priority_high); + +#if USE_TASK_ARENA + if (join) + { + device->arena->execute([&]{ + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + }); + group.wait(); + }); + } + else +#endif + { + group.run([&]{ + tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx); + 
}); + group.wait(); + } + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + +#elif defined(TASKING_GCD) + + commit_task(); + +#endif // #if defined(TASKING_TBB) + + } + catch (...) + { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + +#if defined(TASKING_PPL) + + void Scene::commit (bool join) + { +#if defined(TASKING_PPL) + if (join) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL"); +#endif + + /* try to obtain build lock */ + Lock lock(buildMutex); + + checkIfModifiedAndSet (); + if (!isModified()) { + return; + } + + /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */ + const unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6)); + + try { + + group.run([&]{ + concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); }); + }); + group.wait(); + + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + } + catch (...) 
+ { + /* reset MXCSR register again */ + _mm_setcsr(mxcsr); + + accels_clear(); + updateInterface(); + throw; + } + } +#endif + + void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) + { + progress_monitor_function = func; + progress_monitor_ptr = ptr; + } + + void Scene::progressMonitor(double dn) + { + if (progress_monitor_function) { + size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); + if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { + throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination"); + } + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h new file mode 100644 index 00000000000..b41c6cde913 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene.h @@ -0,0 +1,390 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "device.h" +#include "builder.h" +#include "../../common/algorithms/parallel_any_of.h" +#include "scene_triangle_mesh.h" +#include "scene_quad_mesh.h" +#include "scene_user_geometry.h" +#include "scene_instance.h" +#include "scene_curves.h" +#include "scene_line_segments.h" +#include "scene_subdiv_mesh.h" +#include "scene_grid_mesh.h" +#include "scene_points.h" +#include "../subdiv/tessellation_cache.h" + +#include "acceln.h" +#include "geometry.h" + +namespace embree +{ + /*! 
Base class all scenes are derived from */ + class Scene : public AccelN + { + ALIGNED_CLASS_(std::alignment_of::value); + + public: + template + class Iterator + { + public: + Iterator () {} + + Iterator (Scene* scene, bool all = false) + : scene(scene), all(all) {} + + __forceinline Ty* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!all && !geom->isEnabled()) return nullptr; + const size_t mask = geom->getTypeMask() & Ty::geom_type; + if (!(mask)) return nullptr; + if ((geom->numTimeSteps != 1) != mblur) return nullptr; + return (Ty*) geom; + } + + __forceinline Ty* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + __forceinline size_t numPrimitives() const { + return scene->getNumPrimitives(Ty::geom_type,mblur); + } + + __forceinline size_t maxPrimitivesPerGeometry() + { + size_t ret = 0; + for (size_t i=0; isize(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->size()); + } + return ret; + } + + __forceinline unsigned int maxGeomID() + { + unsigned int ret = 0; + for (size_t i=0; isize(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,(unsigned int)i); + } + return ret; + } + + __forceinline unsigned maxTimeStepsPerGeometry() + { + unsigned ret = 0; + for (size_t i=0; isize(); i++) { + Ty* mesh = at(i); + if (mesh == nullptr) continue; + ret = max(ret,mesh->numTimeSteps); + } + return ret; + } + + private: + Scene* scene; + bool all; + }; + + class Iterator2 + { + public: + Iterator2 () {} + + Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) + : scene(scene), typemask(typemask), mblur(mblur) {} + + __forceinline Geometry* at(const size_t i) + { + Geometry* geom = scene->geometries[i].ptr; + if (geom == nullptr) return nullptr; + if (!geom->isEnabled()) return nullptr; + if (!(geom->getTypeMask() & typemask)) return nullptr; + if 
((geom->numTimeSteps != 1) != mblur) return nullptr; + return geom; + } + + __forceinline Geometry* operator[] (const size_t i) { + return at(i); + } + + __forceinline size_t size() const { + return scene->size(); + } + + private: + Scene* scene; + Geometry::GTypeMask typemask; + bool mblur; + }; + + public: + + /*! Scene construction */ + Scene (Device* device); + + /*! Scene destruction */ + ~Scene () noexcept; + + private: + /*! class is non-copyable */ + Scene (const Scene& other) DELETED; // do not implement + Scene& operator= (const Scene& other) DELETED; // do not implement + + public: + void createTriangleAccel(); + void createTriangleMBAccel(); + void createQuadAccel(); + void createQuadMBAccel(); + void createHairAccel(); + void createHairMBAccel(); + void createSubdivAccel(); + void createSubdivMBAccel(); + void createUserGeometryAccel(); + void createUserGeometryMBAccel(); + void createInstanceAccel(); + void createInstanceMBAccel(); + void createInstanceExpensiveAccel(); + void createInstanceExpensiveMBAccel(); + void createGridAccel(); + void createGridMBAccel(); + + /*! prints statistics about the scene */ + void printStatistics(); + + /*! clears the scene */ + void clear(); + + /*! 
detaches some geometry */ + void detachGeometry(size_t geomID); + + void setBuildQuality(RTCBuildQuality quality_flags); + RTCBuildQuality getBuildQuality() const; + + void setSceneFlags(RTCSceneFlags scene_flags); + RTCSceneFlags getSceneFlags() const; + + void commit (bool join); + void commit_task (); + void build () {} + + void updateInterface(); + + /* return number of geometries */ + __forceinline size_t size() const { return geometries.size(); } + + /* bind geometry to the scene */ + unsigned int bind (unsigned geomID, Ref geometry); + + /* determines if scene is modified */ + __forceinline bool isModified() const { return modified; } + + /* sets modified flag */ + __forceinline void setModified(bool f = true) { + modified = f; + } + + __forceinline bool isGeometryModified(size_t geomID) + { + Ref& g = geometries[geomID]; + if (!g) return false; + return g->getModCounter() > geometryModCounters_[geomID]; + } + + protected: + + __forceinline void checkIfModifiedAndSet () + { + if (isModified ()) return; + + auto geometryIsModified = [this](size_t geomID)->bool { + return isGeometryModified(geomID); + }; + + if (parallel_any_of (size_t(0), geometries.size (), geometryIsModified)) { + setModified (); + } + } + + public: + + /* get mesh by ID */ + __forceinline Geometry* get(size_t i) { assert(i < geometries.size()); return geometries[i].ptr; } + __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; } + + template + __forceinline Mesh* get(size_t i) { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + template + __forceinline const Mesh* get(size_t i) const { + assert(i < geometries.size()); + assert(geometries[i]->getTypeMask() & Mesh::geom_type); + return (Mesh*)geometries[i].ptr; + } + + template + __forceinline Mesh* getSafe(size_t i) { + assert(i < geometries.size()); + if (geometries[i] == null) return nullptr; + if 
(!(geometries[i]->getTypeMask() & Mesh::geom_type)) return nullptr; + else return (Mesh*) geometries[i].ptr; + } + + __forceinline Ref get_locked(size_t i) { + Lock lock(geometriesMutex); + assert(i < geometries.size()); + return geometries[i]; + } + + /* flag decoding */ + __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); } + __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; } + __forceinline bool isRobustAccel() const { return scene_flags & RTC_SCENE_FLAG_ROBUST; } + __forceinline bool isStaticAccel() const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); } + __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; } + + __forceinline bool hasContextFilterFunction() const { + return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION; + } + + __forceinline bool hasGeometryFilterFunction() { + return world.numFilterFunctions != 0; + } + + __forceinline bool hasFilterFunction() { + return hasContextFilterFunction() || hasGeometryFilterFunction(); + } + + /* test if scene got already build */ + __forceinline bool isBuild() const { return is_build; } + + public: + IDPool id_pool; + vector> geometries; //!< list of all user geometries + vector geometryModCounters_; + vector vertices; + + public: + Device* device; + + /* these are to detect if we need to recreate the acceleration structures */ + bool flags_modified; + unsigned int enabled_geometry_types; + + RTCSceneFlags scene_flags; + RTCBuildQuality quality_flags; + MutexSys buildMutex; + SpinLock geometriesMutex; + bool is_build; + private: + bool modified; //!< true if scene got modified + + public: + + /*! 
global lock step task scheduler */ +#if defined(TASKING_INTERNAL) + MutexSys schedulerMutex; + Ref scheduler; +#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION + tbb::isolated_task_group group; +#elif defined(TASKING_TBB) + tbb::task_group group; +#elif defined(TASKING_PPL) + concurrency::task_group group; +#endif + + public: + struct BuildProgressMonitorInterface : public BuildProgressMonitor { + BuildProgressMonitorInterface(Scene* scene) + : scene(scene) {} + void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } + private: + Scene* scene; + }; + BuildProgressMonitorInterface progressInterface; + RTCProgressMonitorFunction progress_monitor_function; + void* progress_monitor_ptr; + std::atomic progress_monitor_counter; + void progressMonitor(double nprims); + void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr); + + private: + GeometryCounts world; //!< counts for geometry + + public: + + __forceinline size_t numPrimitives() const { + return world.size(); + } + + __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const + { + size_t count = 0; + + if (mask & Geometry::MTY_TRIANGLE_MESH) + count += mblur ? world.numMBTriangles : world.numTriangles; + + if (mask & Geometry::MTY_QUAD_MESH) + count += mblur ? world.numMBQuads : world.numQuads; + + if (mask & Geometry::MTY_CURVE2) + count += mblur ? world.numMBLineSegments : world.numLineSegments; + + if (mask & Geometry::MTY_CURVE4) + count += mblur ? world.numMBBezierCurves : world.numBezierCurves; + + if (mask & Geometry::MTY_POINTS) + count += mblur ? world.numMBPoints : world.numPoints; + + if (mask & Geometry::MTY_SUBDIV_MESH) + count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches; + + if (mask & Geometry::MTY_USER_GEOMETRY) + count += mblur ? world.numMBUserGeometries : world.numUserGeometries; + + if (mask & Geometry::MTY_INSTANCE_CHEAP) + count += mblur ? 
world.numMBInstancesCheap : world.numInstancesCheap; + + if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) + count += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive; + + if (mask & Geometry::MTY_GRID_MESH) + count += mblur ? world.numMBGrids : world.numGrids; + + return count; + } + + template + __forceinline unsigned getNumTimeSteps() + { + if (!mblur) + return 1; + + Scene::Iterator iter(this); + return iter.maxTimeStepsPerGeometry(); + } + + template + __forceinline unsigned int getMaxGeomID() + { + Scene::Iterator iter(this); + return iter.maxGeomID(); + } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h new file mode 100644 index 00000000000..2649ab0e3e8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h @@ -0,0 +1,341 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of bicubic bezier curves */ + struct CurveGeometry : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4; + + public: + + /*! bezier curve construction */ + CurveGeometry (Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! 
returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th curve */ + __forceinline const unsigned int& curve(size_t i) const { + return curves[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th tangent of the first time step */ + __forceinline Vec3ff tangent(size_t i) const { + return tangents0[i]; + } + + /*! returns i'th normal derivative of the first time step */ + __forceinline Vec3fa dnormal(size_t i) const { + return dnormals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th tangent of itime'th timestep */ + __forceinline Vec3ff tangent(size_t i, size_t itime) const { + return tangents[itime][i]; + } + + /*! returns i'th normal derivative of itime'th timestep */ + __forceinline Vec3fa dnormal(size_t i, size_t itime) const { + return dnormals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + } + + /*! 
gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + } + + /*! gathers the curve starting with i'th vertex */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const + { + p0 = vertex(i+0); + p1 = vertex(i+1); + p2 = vertex(i+2); + p3 = vertex(i+3); + n0 = normal(i+0); + n1 = normal(i+1); + n2 = normal(i+2); + n3 = normal(i+3); + } + + /*! gathers the curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const + { + p0 = vertex(i+0,itime); + p1 = vertex(i+1,itime); + p2 = vertex(i+2,itime); + p3 = vertex(i+3,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + n2 = normal(i+2,itime); + n3 = normal(i+3,itime); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL1_vertices(size_t i) const + { + prefetchL1(vertices0.getPtr(i)+0); + prefetchL1(vertices0.getPtr(i)+64); + } + + /*! prefetches the curve starting with i'th vertex of itime'th timestep */ + __forceinline void prefetchL2_vertices(size_t i) const + { + prefetchL2(vertices0.getPtr(i)+0); + prefetchL2(vertices0.getPtr(i)+64); + } + + /*! 
loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; + gather(a0,a1,a2,a3,i,itime); + Vec3ff b0,b1,b2,b3; + gather(b0,b1,b2,b3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + + const float t0 = 1.0f - ftime; + const float t1 = ftime; + Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3; + gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime); + Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3; + gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1); + p0 = madd(Vec3ff(t0),a0,t1*b0); + p1 = madd(Vec3ff(t0),a1,t1*b1); + p2 = madd(Vec3ff(t0),a2,t1*b2); + p3 = madd(Vec3ff(t0),a3,t1*b3); + n0 = madd(Vec3ff(t0),an0,t1*bn0); + n1 = madd(Vec3ff(t0),an1,t1*bn1); + n2 = madd(Vec3ff(t0),an2,t1*bn2); + n3 = madd(Vec3ff(t0),an3,t1*bn3); + } + + template + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3; + unsigned int vertexID = curve(primID); + gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime); + SourceCurve3ff ccurve(v0,v1,v2,v3); + SourceCurve3fa ncurve(n0,n1,n2,n3); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, 
const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve(context,ray_org,primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve(context,ray_org,primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + } + + /*! gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; + gather_hermite(ap0,at0,ap1,at1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; + gather_hermite(bp0,bt0,bp1,bt1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + } + + /*! gathers the hermite curve starting with i'th vertex */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const + { + p0 = vertex (i+0); + p1 = vertex (i+1); + t0 = tangent(i+0); + t1 = tangent(i+1); + n0 = normal(i+0); + n1 = normal(i+1); + dn0 = dnormal(i+0); + dn1 = dnormal(i+1); + } + + /*! 
gathers the hermite curve starting with i'th vertex of itime'th timestep */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const + { + p0 = vertex (i+0,itime); + p1 = vertex (i+1,itime); + t0 = tangent(i+0,itime); + t1 = tangent(i+1,itime); + n0 = normal(i+0,itime); + n1 = normal(i+1,itime); + dn0 = dnormal(i+0,itime); + dn1 = dnormal(i+1,itime); + } + + /*! loads curve vertices for specified time */ + __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const float f0 = 1.0f - ftime, f1 = ftime; + Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; + gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime); + Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; + gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1); + p0 = madd(Vec3ff(f0),ap0,f1*bp0); + t0 = madd(Vec3ff(f0),at0,f1*bt0); + n0 = madd(Vec3ff(f0),an0,f1*bn0); + dn0= madd(Vec3ff(f0),adn0,f1*bdn0); + p1 = madd(Vec3ff(f0),ap1,f1*bp1); + t1 = madd(Vec3ff(f0),at1,f1*bt1); + n1 = madd(Vec3ff(f0),an1,f1*bn1); + dn1= madd(Vec3ff(f0),adn1,f1*bdn1); + } + + template + __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const + { + Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1; + unsigned int vertexID = curve(primID); + gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime); + + SourceCurve3ff ccurve(v0,t0,v1,t1); + SourceCurve3fa ncurve(n0,dn0,n1,dn1); + ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve); + return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + } + + template + __forceinline TensorLinearCubicBezierSurface3fa 
getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const + { + float ftime; + const size_t itime = timeSegment(time, ftime); + const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve(context, ray_org, primID,itime+0); + const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve(context, ray_org, primID,itime+1); + return clerp(curve0,curve1,ftime); + } + + private: + void resizeBuffers(unsigned int numSteps); + + public: + BufferView curves; //!< array of curve indices + BufferView vertices0; //!< fast access to first vertex buffer + BufferView normals0; //!< fast access to first normal buffer + BufferView tangents0; //!< fast access to first tangent buffer + BufferView dnormals0; //!< fast access to first normal derivative buffer + vector> vertices; //!< vertex array for each timestep + vector> normals; //!< normal array for each timestep + vector> tangents; //!< tangent array for each timestep + vector> dnormals; //!< normal derivative array for each timestep + BufferView flags; //!< start, end flag per segment + vector> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for flat curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h new file mode 100644 index 00000000000..c08658466a2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Grid Mesh */ + struct GridMesh : public Geometry + { + /*! 
type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH; + + /*! grid */ + struct Grid + { + unsigned int startVtxID; + unsigned int lineVtxOffset; + unsigned short resX,resY; + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsX(const unsigned int x) const + { + return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0; + } + + /* border flags due to 3x3 vertex pattern */ + __forceinline unsigned int get3x3FlagsY(const unsigned int y) const + { + return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0; + } + + /*! outputs grid structure */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) { + return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }"; + } + }; + + public: + + /*! grid mesh construction */ + GridMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + __forceinline unsigned int getNumSubGrids(const size_t gridID) + { + const Grid &g = grid(gridID); + return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + + /*! 
returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th grid*/ + __forceinline const Grid& grid(size_t i) const { + return grids[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th vertex of the first timestep */ + __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const { + assert(x < (size_t)g.resX); + assert(y < (size_t)g.resY); + return g.startVtxID + x + y * g.lineVtxOffset; + } + + /*! returns i'th vertex of the first timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index); + } + + /*! returns i'th vertex of the itime'th timestep */ + __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const { + const size_t index = grid_vertex_index(g,x,y); + return vertex(index,itime); + } + + /*! 
calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const + { + BBox3fa b(empty); + for (size_t t=0; t& itime_range) const + { + if (unlikely(gridID >= grids.size())) return false; + const Grid &g = grid(gridID); + if (unlikely(g.startVtxID + 0 >= vertices0.size())) return false; + if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false; + + for (size_t y=0;y grids; //!< array of triangles + BufferView vertices0; //!< fast access to first vertex buffer + vector> vertices; //!< vertex array for each timestep + vector vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct GridMeshISA : public GridMesh + { + GridMeshISA (Device* device) + : GridMesh(device) {} + }; + } + + DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h new file mode 100644 index 00000000000..7ff82a4fb8d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h @@ -0,0 +1,272 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "accel.h" + +namespace embree +{ + struct MotionDerivativeCoefficients; + + /*! 
Instanced acceleration structure */ + struct Instance : public Geometry + { + ALIGNED_STRUCT_(16); + static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE; + + public: + Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1); + ~Instance(); + + private: + Instance (const Instance& other) DELETED; // do not implement + Instance& operator= (const Instance& other) DELETED; // do not implement + + private: + LBBox3fa nonlinearBounds(const BBox1f& time_range_in, + const BBox1f& geom_time_range, + float geom_time_segments) const; + + BBox3fa boundSegment(size_t itime, + BBox3fa const& obbox0, BBox3fa const& obbox1, + BBox3fa const& bbox0, BBox3fa const& bbox1, + float t_min, float t_max) const; + + /* calculates the (correct) interpolated bounds */ + __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), + lerp(getObjectBounds(itime0), getObjectBounds(itime1), f)); + } + + public: + virtual void setNumTimeSteps (unsigned int numTimeSteps) override; + virtual void setInstancedScene(const Ref& scene) override; + virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override; + virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override; + virtual AffineSpace3fa getTransform(float time) override; + virtual void setMask (unsigned mask) override; + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const override; + virtual void commit() override; + + public: + + /*! 
calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds()); + return xfmBounds(local2world[0],object->bounds.bounds()); + } + + /*! gets the bounds of the instanced scene */ + __forceinline BBox3fa getObjectBounds(size_t itime) const { + return object->getBounds(timeStep(itime)); + } + + /*! calculates the bounds of instance */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const { + assert(i == 0); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime)); + return xfmBounds(local2world[itime],getObjectBounds(itime)); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const { + assert(i == 0); + LBBox3fa lbbox = nonlinearBounds(dt, time_range, fnumTimeSegments); + return lbbox; + } + + /*! calculates the build bounds of the i'th item, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + assert(i==0); + const BBox3fa b = bounds(i); + if (bbox) *bbox = b; + return isvalid(b); + } + + /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + assert(i==0); + const LBBox3fa bounds = linearBounds(i,itime); + bbox = bounds.bounds (); + return isvalid(bounds); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return numPrimitives; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return numPrimitives != otherVersion; + } + + /*! 
check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + assert(i == 0); + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + if (!isvalid(bounds(i,itime))) return false; + + return true; + } + + __forceinline AffineSpace3fa getLocal2World() const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return quaternionDecompositionToAffineSpace(local2world[0]); + return local2world[0]; + } + + __forceinline AffineSpace3fa getLocal2World(float t) const + { + float ftime; const unsigned int itime = timeSegment(t, ftime); + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return slerp(local2world[itime+0],local2world[itime+1],ftime); + return lerp(local2world[itime+0],local2world[itime+1],ftime); + } + + __forceinline AffineSpace3fa getWorld2Local() const { + return world2local0; + } + + __forceinline AffineSpace3fa getWorld2Local(float t) const { + return rcp(getLocal2World(t)); + } + + template + __forceinline AffineSpace3vf getWorld2Local(const vbool& valid, const vfloat& t) const + { + if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) + return getWorld2LocalSlerp(valid, t); + return getWorld2LocalLerp(valid, t); + } + + private: + + template + __forceinline AffineSpace3vf getWorld2LocalSlerp(const vbool& valid, const vfloat& t) const + { + vfloat ftime; + const vint itime_k = timeSegment(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint(itime)))) { + return rcp(slerp(AffineSpace3vff(local2world[itime+0]), + AffineSpace3vff(local2world[itime+1]), + ftime)); + } + else { + AffineSpace3vff space0,space1; + vbool valid1 = valid; + while (any(valid1)) { + vbool valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vff(local2world[itime+0]), space0); + space1 = 
select(valid2, AffineSpace3vff(local2world[itime+1]), space1); + } + return rcp(slerp(space0, space1, ftime)); + } + } + + template + __forceinline AffineSpace3vf getWorld2LocalLerp(const vbool& valid, const vfloat& t) const + { + vfloat ftime; + const vint itime_k = timeSegment(t, ftime); + assert(any(valid)); + const size_t index = bsf(movemask(valid)); + const int itime = itime_k[index]; + if (likely(all(valid, itime_k == vint(itime)))) { + return rcp(lerp(AffineSpace3vf((AffineSpace3fa)local2world[itime+0]), + AffineSpace3vf((AffineSpace3fa)local2world[itime+1]), + ftime)); + } else { + AffineSpace3vf space0,space1; + vbool valid1 = valid; + while (any(valid1)) { + vbool valid2; + const int itime = next_unique(valid1, itime_k, valid2); + space0 = select(valid2, AffineSpace3vf((AffineSpace3fa)local2world[itime+0]), space0); + space1 = select(valid2, AffineSpace3vf((AffineSpace3fa)local2world[itime+1]), space1); + } + return rcp(lerp(space0, space1, ftime)); + } + } + + public: + Accel* object; //!< pointer to instanced acceleration structure + AffineSpace3ff* local2world; //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition) + AffineSpace3fa world2local0; //!< transformation from world space to local space for timestep 0 + }; + + namespace isa + { + struct InstanceISA : public Instance + { + InstanceISA (Device* device) + : Instance(device) {} + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // const BBox3fa b = bounds(0); + // if (!isvalid(b)) return pinfo; + + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + 
assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfo pinfo(empty); + BBox3fa b = empty; + if (!buildBounds(0,&b)) return pinfo; + // if (!valid(0,range(itime))) return pinfo; + // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0)); + const PrimRef prim(b,geomID,unsigned(0)); + pinfo.add_center2(prim); + prims[k++] = prim; + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const + { + assert(r.begin() == 0); + assert(r.end() == 1); + + PrimInfoMB pinfo(empty); + if (!valid(0, timeSegmentRange(t0t1))) return pinfo; + const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0)); + pinfo.add_primref(prim); + prims[k++] = prim; + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h new file mode 100644 index 00000000000..c0f9ee8f779 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h @@ -0,0 +1,307 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! represents an array of line segments */ + struct LineSegments : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2; + + public: + + /*! 
line segments construction */ + LineSegments (Device* device, Geometry::GType gtype); + + public: + void setMask (unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify (); + void interpolate(const RTCInterpolateArguments* const args); + void setTessellationRate(float N); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns the i'th segment */ + __forceinline const unsigned int& segment(size_t i) const { + return segments[i]; + } + + /*! returns the segment to the left of the i'th segment */ + __forceinline bool segmentLeftExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0; + } + + /*! returns the segment to the right of the i'th segment */ + __forceinline bool segmentRightExists(size_t i) const { + assert (flags); + return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0; + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! 
returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const + { + const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); + return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + return bounds(v0,v1); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0); + const Vec3ff v1 = vertex(index+1); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! 
calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const unsigned int index = segment(i); + const Vec3ff v0 = vertex(index+0,itime); + const Vec3ff v1 = vertex(index+1,itime); + const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); + const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w); + return bounds(w0,w1); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + const unsigned int index = segment(i); + if (index+1 >= numVertices()) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false; + const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false; + if (min(v0.w,v1.w) < 0.0f) return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i,0)) return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i,itime+0) || !valid(i,itime+1)) return false; + bbox = bounds(i,itime); // use bounds of first time step in builder + return true; + } + + /*! 
calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView segments; //!< array of line segment indices + BufferView vertices0; //!< fast access to first vertex buffer + BufferView normals0; //!< fast access to first normal buffer + BufferView flags; //!< start, end flag per segment + vector> vertices; //!< vertex array for each timestep + vector> normals; //!< normal array for each timestep + vector> vertexAttribs; //!< user buffers + int tessellationRate; //!< tessellation rate for bezier curve + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct LineSegmentsISA : public LineSegments + { + LineSegmentsISA (Device* device, Geometry::GType gtype) + : LineSegments(device,gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0); + const Vec3fa v1 = vertex(vtxID+1); + return v1-v0; + } + + Vec3fa computeDirection(unsigned int 
primID, size_t time) const + { + const unsigned vtxID = segment(primID); + const Vec3fa v0 = vertex(vtxID+0,time); + const Vec3fa v1 = vertex(vtxID+1,time); + return v1-v0; + } + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { + return bounds(space,i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { + return linearBounds(primID,time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { + return linearBounds(space,primID,time_range); + } + }; + } + + DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h new file mode 100644 index 00000000000..1d39ed07baa --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h @@ -0,0 +1,282 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "buffer.h" +#include "default.h" +#include "geometry.h" + +namespace embree +{ + /*! represents an array of points */ + struct Points : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS; + + public: + /*! 
line segments construction */ + Points(Device* device, Geometry::GType gtype); + + public: + void setMask(unsigned mask); + void setNumTimeSteps(unsigned int numTimeSteps); + void setVertexAttributeCount(unsigned int N); + void setBuffer(RTCBufferType type, + unsigned int slot, + RTCFormat format, + const Ref& buffer, + size_t offset, + size_t stride, + unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void setMaxRadiusScale(float s); + void addElementsToCount (GeometryCounts & counts) const; + + public: + /*! returns the number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th vertex of the first time step */ + __forceinline Vec3ff vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th normal of the first time step */ + __forceinline Vec3fa normal(size_t i) const { + return normals0[i]; + } + + /*! returns i'th radius of the first time step */ + __forceinline float radius(size_t i) const { + return vertices0[i].w; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline Vec3ff vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! returns i'th normal of itime'th timestep */ + __forceinline Vec3fa normal(size_t i, size_t itime) const { + return normals[itime][i]; + } + + /*! returns i'th radius of itime'th timestep */ + __forceinline float radius(size_t i, size_t itime) const { + return vertices[itime][i].w; + } + + /*! 
calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const Vec3ff& v0) const { + return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w)); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(size_t i) const + { + const Vec3ff v0 = vertex(i); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + return bounds(v0); + } + + /*! calculates bounding box of i'th line segment */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const + { + const Vec3ff v0 = vertex(i); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! calculates bounding box of i'th line segment for the itime'th time step */ + __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const + { + const Vec3ff v0 = vertex(i, itime); + const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); + return bounds(w0); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + const unsigned int index = (unsigned int)i; + if (index >= numVertices()) + return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) { + const Vec3ff v0 = vertex(index + 0, itime); + if (unlikely(!isvalid4(v0))) + return false; + if (v0.w < 0.0f) + return false; + } + return true; + } + + /*! calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1)); + } + + /*! 
calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const + { + if (!valid(i, 0)) + return false; + *bbox = bounds(i); + return true; + } + + /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */ + __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const + { + if (!valid(i, itime + 0) || !valid(i, itime + 1)) + return false; + bbox = bounds(i, itime); // use bounds of first time step in builder + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const { + return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(time_range))) return false; + bbox = linearBounds(i, time_range); + return true; + } + + /*! 
get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + public: + BufferView vertices0; //!< fast access to first vertex buffer + BufferView normals0; //!< fast access to first normal buffer + vector> vertices; //!< vertex array for each timestep + vector> normals; //!< normal array for each timestep + vector> vertexAttribs; //!< user buffers + float maxRadiusScale = 1.0; //!< maximal min-width scaling of curve radii + }; + + namespace isa + { + struct PointsISA : public Points + { + PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} + + Vec3fa computeDirection(unsigned int primID) const + { + return Vec3fa(1, 0, 0); + } + + Vec3fa computeDirection(unsigned int primID, size_t time) const + { + return Vec3fa(1, 0, 0); + } + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, &bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfo createPrimRefArrayMB(mvector& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + BBox3fa bounds = empty; + if (!buildBounds(j, itime, bounds)) + continue; + const PrimRef prim(bounds, geomID, unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + } + + PrimInfoMB createPrimRefMBArray(mvector& prims, + const BBox1f& t0t1, + const range& r, + size_t k, + unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j = r.begin(); j < r.end(); j++) { + if (!valid(j, timeSegmentRange(t0t1))) + continue; + const PrimRefMB prim(linearBounds(j, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, 
unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + + BBox3fa vbounds(size_t i) const + { + return bounds(i); + } + + BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const + { + return bounds(space, i); + } + + LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const + { + return linearBounds(primID, time_range); + } + + LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const + { + return linearBounds(space, primID, time_range); + } + }; + } // namespace isa + + DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType); +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h new file mode 100644 index 00000000000..d5bb054b14a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h @@ -0,0 +1,277 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" + +namespace embree +{ + /*! Quad Mesh */ + struct QuadMesh : public Geometry + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH; + + /*! triangle indices */ + struct Quad + { + uint32_t v[4]; + + /*! outputs triangle indices */ + __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) { + return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }"; + } + }; + + public: + + /*! 
quad mesh construction */ + QuadMesh (Device* device); + + /* geometry interface */ + public: + void setMask(unsigned mask); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th quad */ + __forceinline const Quad& quad(size_t i) const { + return quads[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th quad */ + __forceinline BBox3fa bounds(size_t i) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! 
calculates the bounds of the i'th quad at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0],itime); + const Vec3fa v1 = vertex(q.v[1],itime); + const Vec3fa v2 = vertex(q.v[2],itime); + const Vec3fa v3 = vertex(q.v[3],itime); + return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + const Quad& q = quad(i); + if (unlikely(q.v[0] >= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(q.v[0],itime))) return false; + if (!isvalid(vertex(q.v[1],itime))) return false; + if (!isvalid(vertex(q.v[2],itime))) return false; + if (!isvalid(vertex(q.v[3],itime))) return false; + } + + return true; + } + + /*! calculates the linear bounds of the i'th quad at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! 
calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Quad& q = quad(i); + if (q.v[0] >= numVertices()) return false; + if (q.v[1] >= numVertices()) return false; + if (q.v[2] >= numVertices()) return false; + if (q.v[3] >= numVertices()) return false; + + for (unsigned int t=0; t= numVertices())) return false; + if (unlikely(q.v[1] >= numVertices())) return false; + if (unlikely(q.v[2] >= numVertices())) return false; + if (unlikely(q.v[3] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false; + const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const + { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! 
get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return quads.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return quads.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Quad& q = quad(i); + const Vec3fa v0 = vertex(q.v[0]); + const Vec3fa v1 = vertex(q.v[1]); + const Vec3fa v2 = vertex(q.v[2]); + const Vec3fa v3 = vertex(q.v[3]); + return areaProjectedTriangle(v0,v1,v3) + + areaProjectedTriangle(v1,v2,v3); + } + + public: + BufferView quads; //!< array of quads + BufferView vertices0; //!< fast access to first vertex buffer + vector> vertices; //!< vertex array for each timestep + vector> vertexAttribs; //!< vertex attribute buffers + }; + + namespace isa + { + struct QuadMeshISA : public QuadMesh + { + QuadMeshISA (Device* device) + : QuadMesh(device) {} + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h new file mode 100644 index 00000000000..d0246009dbd --- 
/dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h @@ -0,0 +1,326 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "geometry.h" +#include "buffer.h" +#include "../subdiv/half_edge.h" +#include "../subdiv/tessellation_cache.h" +#include "../subdiv/catmullclark_coefficients.h" +#include "../subdiv/patch.h" +#include "../../common/algorithms/parallel_map.h" +#include "../../common/algorithms/parallel_set.h" + +namespace embree +{ + class SubdivMesh : public Geometry + { + ALIGNED_CLASS_(16); + public: + + typedef HalfEdge::Edge Edge; + + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH; + + /*! structure used to sort half edges using radix sort by their key */ + struct KeyHalfEdge + { + KeyHalfEdge() {} + + KeyHalfEdge (uint64_t key, HalfEdge* edge) + : key(key), edge(edge) {} + + __forceinline operator uint64_t() const { + return key; + } + + friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) { + return e0.key < e1.key; + } + + public: + uint64_t key; + HalfEdge* edge; + }; + + public: + + /*! 
subdiv mesh construction */ + SubdivMesh(Device* device); + + public: + void setMask (unsigned mask); + void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode); + void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID); + void setNumTimeSteps (unsigned int numTimeSteps); + void setVertexAttributeCount (unsigned int N); + void setTopologyCount (unsigned int N); + void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void setTessellationRate(float N); + bool verify(); + void commit(); + void addElementsToCount (GeometryCounts & counts) const; + void setDisplacementFunction (RTCDisplacementFunctionN func); + unsigned int getFirstHalfEdge(unsigned int faceID); + unsigned int getFace(unsigned int edgeID); + unsigned int getNextHalfEdge(unsigned int edgeID); + unsigned int getPreviousHalfEdge(unsigned int edgeID); + unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID); + + public: + + /*! return the number of faces */ + size_t numFaces() const { + return faceVertices.size(); + } + + /*! return the number of edges */ + size_t numEdges() const { + return topology[0].vertexIndices.size(); + } + + /*! return the number of vertices */ + size_t numVertices() const { + return vertices[0].size(); + } + + /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t j = 0) const { + return topology[0].getHalfEdge(i)->bounds(vertices[j]); + } + + /*! check if the i'th primitive is valid */ + __forceinline bool valid(size_t i) const { + return topology[0].valid(i) && !invalidFace(i); + } + + /*! 
check if the i'th primitive is valid for the j'th time range */ + __forceinline bool valid(size_t i, size_t j) const { + return topology[0].valid(i) && !invalidFace(i,j); + } + + /*! prints some statistics */ + void printStatistics(); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + public: + + /*! returns the vertex buffer for some time step */ + __forceinline const BufferView& getVertexBuffer( const size_t t = 0 ) const { + return vertices[t]; + } + + /* returns tessellation level of edge */ + __forceinline float getEdgeLevel(const size_t i) const + { + if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level? + else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level? + } + + public: + RTCDisplacementFunctionN displFunc; //!< displacement function + + /*! all buffers in this section are provided by the application */ + public: + + /*! the topology contains all data that may differ when + * interpolating different user data buffers */ + struct Topology + { + public: + + /*! Default topology construction */ + Topology () : halfEdges(nullptr,0) {} + + /*! Topology initialization */ + Topology (SubdivMesh* mesh); + + /*! 
make the class movable */ + public: + Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + : mesh(std::move(other.mesh)), + vertexIndices(std::move(other.vertexIndices)), + subdiv_mode(std::move(other.subdiv_mode)), + halfEdges(std::move(other.halfEdges)), + halfEdges0(std::move(other.halfEdges0)), + halfEdges1(std::move(other.halfEdges1)) {} + + Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows + { + mesh = std::move(other.mesh); + vertexIndices = std::move(other.vertexIndices); + subdiv_mode = std::move(other.subdiv_mode); + halfEdges = std::move(other.halfEdges); + halfEdges0 = std::move(other.halfEdges0); + halfEdges1 = std::move(other.halfEdges1); + return *this; + } + + public: + /*! check if the i'th primitive is valid in this topology */ + __forceinline bool valid(size_t i) const + { + if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY)) { + if (getHalfEdge(i)->faceHasBorder()) return false; + } + return true; + } + + /*! updates the interpolation mode for the topology */ + void setSubdivisionMode (RTCSubdivisionMode mode); + + /*! marks all buffers as modified */ + void update (); + + /*! verifies index array */ + bool verify (size_t numVertices); + + /*! initializes the half edge data structure */ + void initializeHalfEdgeStructures (); + + private: + + /*! recalculates the half edges */ + void calculateHalfEdges(); + + /*! updates half edges when recalculation is not necessary */ + void updateHalfEdges(); + + /*! user input data */ + public: + + SubdivMesh* mesh; + + /*! indices of the vertices composing each face */ + BufferView vertexIndices; + + /*! subdiv interpolation mode */ + RTCSubdivisionMode subdiv_mode; + + /*! generated data */ + public: + + /*! 
returns the start half edge for face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { + return &halfEdges[mesh->faceStartEdge[f]]; + } + + /*! Half edge structure, generated by initHalfEdgeStructures */ + mvector halfEdges; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! two arrays used to sort the half edges */ + std::vector halfEdges0; + std::vector halfEdges1; + }; + + /*! returns the start half edge for topology t and face f */ + __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { + return topology[t].getHalfEdge(f); + } + + /*! buffer containing the number of vertices for each face */ + BufferView faceVertices; + + /*! array of topologies */ + vector topology; + + /*! vertex buffer (one buffer for each time step) */ + vector> vertices; + + /*! user data buffers */ + vector vertexAttribs; + + /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */ + BufferView edge_creases; + + /*! edge crease weights for each edge of the edge_creases buffer */ + BufferView edge_crease_weights; + + /*! vertex crease buffer containing all vertices that carry vertex crease weights */ + BufferView vertex_creases; + + /*! vertex crease weights for each vertex of the vertex_creases buffer */ + BufferView vertex_crease_weights; + + /*! subdivision level for each half edge of the vertexIndices buffer */ + BufferView levels; + float tessellationRate; // constant rate that is used when levels is not set + + /*! buffer that marks specific faces as holes */ + BufferView holes; + + /*! all data in this section is generated by initializeHalfEdgeStructures function */ + private: + + /*! number of half edges used by faces */ + size_t numHalfEdges; + + /*! fast lookup table to find the first half edge for some face */ + mvector faceStartEdge; + + /*! 
fast lookup table to find the face for some half edge */ + mvector halfEdgeFace; + + /*! set with all holes */ + parallel_set holeSet; + + /*! fast lookup table to detect invalid faces */ + mvector invalid_face; + + /*! test if face i is invalid in timestep j */ + __forceinline int8_t& invalidFace(size_t i, size_t j = 0) { return invalid_face[i*numTimeSteps+j]; } + __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; } + + /*! interpolation cache */ + public: + static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; } + static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; } + static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) { + const size_t slots = numInterpolationSlots4(stride); + assert(slot < slots); + return slots*prim+slot; + } + std::vector> vertex_buffer_tags; + std::vector> vertex_attrib_buffer_tags; + std::vector patch_eval_trees; + + /*! the following data is only required during construction of the + * half edge structure and can be cleared for static scenes */ + private: + + /*! map with all vertex creases */ + parallel_map vertexCreaseMap; + + /*! map with all edge creases */ + parallel_map edgeCreaseMap; + + protected: + + /*! 
counts number of geometry commits */ + size_t commitCounter; + }; + + namespace isa + { + struct SubdivMeshISA : public SubdivMesh + { + SubdivMeshISA (Device* device) + : SubdivMesh(device) {} + + void interpolate(const RTCInterpolateArguments* const args); + void interpolateN(const RTCInterpolateNArguments* const args); + }; + } + + DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*); +}; diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp new file mode 100644 index 00000000000..d1c2750f14a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp @@ -0,0 +1,243 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "scene_triangle_mesh.h" +#include "scene.h" + +namespace embree +{ +#if defined(EMBREE_LOWEST_ISA) + + TriangleMesh::TriangleMesh (Device* device) + : Geometry(device,GTY_TRIANGLE_MESH,0,1) + { + vertices.resize(numTimeSteps); + } + + void TriangleMesh::setMask (unsigned mask) + { + this->mask = mask; + Geometry::update(); + } + + void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) + { + vertices.resize(numTimeSteps); + Geometry::setNumTimeSteps(numTimeSteps); + } + + void TriangleMesh::setVertexAttributeCount (unsigned int N) + { + vertexAttribs.resize(N); + Geometry::update(); + } + + void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref& buffer, size_t offset, size_t stride, unsigned int num) + { + /* verify that all accesses are 4 bytes aligned */ + if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned"); + + if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (format != RTC_FORMAT_FLOAT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format"); + + /* if buffer is larger than 16GB the premultiplied index optimization does 
not work */ + if (stride*num > 16ll*1024ll*1024ll*1024ll) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large"); + + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot"); + + vertices[slot].set(buffer, offset, stride, num, format); + vertices[slot].checkPadding16(); + vertices0 = vertices[0]; + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format"); + + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot"); + + vertexAttribs[slot].set(buffer, offset, stride, num, format); + vertexAttribs[slot].checkPadding16(); + } + else if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + if (format != RTC_FORMAT_UINT3) + throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format"); + + triangles.set(buffer, offset, stride, num, format); + setNumPrimitives(num); + } + else + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return triangles.getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertices[slot].getPtr(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + return vertexAttribs[slot].getPtr(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + return nullptr; + } + } + + void 
TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot) + { + if (type == RTC_BUFFER_TYPE_INDEX) + { + if (slot != 0) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + triangles.setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX) + { + if (slot >= vertices.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertices[slot].setModified(); + } + else if (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) + { + if (slot >= vertexAttribs.size()) + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot"); + vertexAttribs[slot].setModified(); + } + else + { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type"); + } + + Geometry::update(); + } + + void TriangleMesh::commit() + { + /* verify that stride of all time steps are identical */ + for (unsigned int t=0; t= numVertices()) return false; + if (triangles[i].v[1] >= numVertices()) return false; + if (triangles[i].v[2] >= numVertices()) return false; + } + + /*! 
verify vertices */ + for (const auto& buffer : vertices) + for (size_t i=0; iprimID; + float u = args->u; + float v = args->v; + RTCBufferType bufferType = args->bufferType; + unsigned int bufferSlot = args->bufferSlot; + float* P = args->P; + float* dPdu = args->dPdu; + float* dPdv = args->dPdv; + float* ddPdudu = args->ddPdudu; + float* ddPdvdv = args->ddPdvdv; + float* ddPdudv = args->ddPdudv; + unsigned int valueCount = args->valueCount; + + /* calculate base pointer and stride */ + assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || + (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); + const char* src = nullptr; + size_t stride = 0; + if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { + src = vertexAttribs[bufferSlot].getPtr(); + stride = vertexAttribs[bufferSlot].getStride(); + } else { + src = vertices[bufferSlot].getPtr(); + stride = vertices[bufferSlot].getStride(); + } + + for (unsigned int i=0; i& buffer, size_t offset, size_t stride, unsigned int num); + void* getBuffer(RTCBufferType type, unsigned int slot); + void updateBuffer(RTCBufferType type, unsigned int slot); + void commit(); + bool verify(); + void interpolate(const RTCInterpolateArguments* const args); + void addElementsToCount (GeometryCounts & counts) const; + + public: + + /*! returns number of vertices */ + __forceinline size_t numVertices() const { + return vertices[0].size(); + } + + /*! returns i'th triangle*/ + __forceinline const Triangle& triangle(size_t i) const { + return triangles[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const Vec3fa vertex(size_t i) const { + return vertices0[i]; + } + + /*! returns i'th vertex of the first time step */ + __forceinline const char* vertexPtr(size_t i) const { + return vertices0.getPtr(i); + } + + /*! 
returns i'th vertex of itime'th timestep */ + __forceinline const Vec3fa vertex(size_t i, size_t itime) const { + return vertices[itime][i]; + } + + /*! returns i'th vertex of itime'th timestep */ + __forceinline const char* vertexPtr(size_t i, size_t itime) const { + return vertices[itime].getPtr(i); + } + + /*! calculates the bounds of the i'th triangle */ + __forceinline BBox3fa bounds(size_t i) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! calculates the bounds of the i'th triangle at the itime'th timestep */ + __forceinline BBox3fa bounds(size_t i, size_t itime) const + { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0],itime); + const Vec3fa v1 = vertex(tri.v[1],itime); + const Vec3fa v2 = vertex(tri.v[2],itime); + return BBox3fa(min(v0,v1,v2),max(v0,v1,v2)); + } + + /*! check if the i'th primitive is valid at the itime'th timestep */ + __forceinline bool valid(size_t i, size_t itime) const { + return valid(i, make_range(itime, itime)); + } + + /*! check if the i'th primitive is valid between the specified time range */ + __forceinline bool valid(size_t i, const range& itime_range) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) + { + if (!isvalid(vertex(tri.v[0],itime))) return false; + if (!isvalid(vertex(tri.v[1],itime))) return false; + if (!isvalid(vertex(tri.v[2],itime))) return false; + } + + return true; + } + + /*! 
calculates the linear bounds of the i'th primitive at the itimeGlobal'th time segment */ + __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const { + return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); + } + + /*! calculates the build bounds of the i'th primitive, if it's valid */ + __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const + { + const Triangle& tri = triangle(i); + if (unlikely(tri.v[0] >= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + for (size_t t=0; t= numVertices())) return false; + if (unlikely(tri.v[1] >= numVertices())) return false; + if (unlikely(tri.v[2] >= numVertices())) return false; + + assert(itime+1 < numTimeSteps); + const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false; + const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false; + const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false; + const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false; + const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false; + const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false; + + /* use bounds of first time step in builder */ + bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2)); + return true; + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const { + return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments); + } + + /*! calculates the linear bounds of the i'th primitive for the specified time range */ + __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const { + if (!valid(i, timeSegmentRange(dt))) return false; + bbox = linearBounds(i, dt); + return true; + } + + /*! 
get fast access to first vertex buffer */ + __forceinline float * getCompactVertexArray () const { + return (float*) vertices0.getPtr(); + } + + /* gets version info of topology */ + unsigned int getTopologyVersion() const { + return triangles.modCounter; + } + + /* returns true if topology changed */ + bool topologyChanged(unsigned int otherVersion) const { + return triangles.isModified(otherVersion); // || numPrimitivesChanged; + } + + /* returns the projected area */ + __forceinline float projectedPrimitiveArea(const size_t i) const { + const Triangle& tri = triangle(i); + const Vec3fa v0 = vertex(tri.v[0]); + const Vec3fa v1 = vertex(tri.v[1]); + const Vec3fa v2 = vertex(tri.v[2]); + return areaProjectedTriangle(v0,v1,v2); + } + + public: + BufferView triangles; //!< array of triangles + BufferView vertices0; //!< fast access to first vertex buffer + vector> vertices; //!< vertex array for each timestep + vector vertexAttribs; //!< vertex attributes + }; + + namespace isa + { + struct TriangleMeshISA : public TriangleMesh + { + TriangleMeshISA (Device* device) + : TriangleMesh(device) {} + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h new file mode 100644 index 00000000000..8d11ed69860 --- /dev/null +++ 
b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h @@ -0,0 +1,77 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "accelset.h" + +namespace embree +{ + /*! User geometry with user defined intersection functions */ + struct UserGeometry : public AccelSet + { + /*! type of this geometry */ + static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY; + + public: + UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); + virtual void setMask (unsigned mask); + virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); + virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); + virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); + virtual void build() {} + virtual void addElementsToCount (GeometryCounts & counts) const; + }; + + namespace isa + { + struct UserGeometryISA : public UserGeometry + { + UserGeometryISA (Device* device) + : UserGeometry(device) {} + + PrimInfo createPrimRefArray(mvector& prims, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, size_t itime, const range& r, size_t k, unsigned int geomID) const + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j& prims, const BBox1f& t0t1, const range& r, size_t k, unsigned int geomID) const + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); jnumTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(j)); + pinfo.add_primref(prim); + prims[k++] = prim; + } + return pinfo; + } + }; + } + + DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*); +} diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h new file mode 100644 index 00000000000..533c3853652 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h @@ -0,0 +1,125 @@ +// Copyright 2009-2020 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /*! An item on the stack holds the node ID and distance of that node. */ + template + struct __aligned(16) StackItemT + { + /*! assert that the xchg function works */ + static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed"); + + __forceinline StackItemT() {} + + __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} + + /*! use SSE instructions to swap stack items */ + __forceinline static void xchg(StackItemT& a, StackItemT& b) + { + const vfloat4 sse_a = vfloat4::load((float*)&a); + const vfloat4 sse_b = vfloat4::load((float*)&b); + vfloat4::store(&a,sse_b); + vfloat4::store(&b,sse_a); + } + + /*! Sort 2 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2) { + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 3 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s3.dist < s2.dist) xchg(s3,s2); + if (s2.dist < s1.dist) xchg(s2,s1); + } + + /*! Sort 4 stack items. */ + __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4) + { + if (s2.dist < s1.dist) xchg(s2,s1); + if (s4.dist < s3.dist) xchg(s4,s3); + if (s3.dist < s1.dist) xchg(s3,s1); + if (s4.dist < s2.dist) xchg(s4,s2); + if (s3.dist < s2.dist) xchg(s3,s2); + } + + /*! use SSE instructions to swap stack items */ + __forceinline static void cmp_xchg(vint4& a, vint4& b) + { +#if defined(__AVX512VL__) + const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); +#else + const vboolf4 mask0(b < a); + const vboolf4 mask(shuffle<2,2,2,2>(mask0)); +#endif + const vint4 c = select(mask,b,a); + const vint4 d = select(mask,a,b); + a = c; + b = d; + } + + /*! Sort 3 stack items. */ + __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3) + { + cmp_xchg(s2,s1); + cmp_xchg(s3,s2); + cmp_xchg(s2,s1); + } + + /*! 
Sort 4 stack items. */ + __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4) + { + cmp_xchg(s2,s1); + cmp_xchg(s4,s3); + cmp_xchg(s3,s1); + cmp_xchg(s4,s2); + cmp_xchg(s3,s2); + } + + + /*! Sort N stack items. */ + __forceinline friend void sort(StackItemT* begin, StackItemT* end) + { + for (StackItemT* i = begin+1; i != end; ++i) + { + const vfloat4 item = vfloat4::load((float*)i); + const unsigned dist = i->dist; + StackItemT* j = i; + + while ((j != begin) && ((j-1)->dist < dist)) + { + vfloat4::store(j, vfloat4::load((float*)(j-1))); + --j; + } + + vfloat4::store(j, item); + } + } + + public: + T ptr; + unsigned dist; + }; + + /*! An item on the stack holds the node ID and active ray mask. */ + template + struct __aligned(8) StackItemMaskT + { + T ptr; + size_t mask; + }; + + struct __aligned(8) StackItemMaskCoherent + { + size_t mask; + size_t parent; + size_t child; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp new file mode 100644 index 00000000000..b73c3a8c766 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp @@ -0,0 +1,128 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "stat.h" + +namespace embree +{ + Stat Stat::instance; + + Stat::Stat () { + } + + Stat::~Stat () + { +#ifdef EMBREE_STAT_COUNTERS + Stat::print(std::cout); +#endif + } + + void Stat::print(std::ostream& cout) + { + Counters& cntrs = instance.cntrs; + Counters::Data& data = instance.cntrs.code; + //Counters::Data& data = instance.cntrs.active; + + /* print absolute numbers */ + cout << "--------- ABSOLUTE ---------" << std::endl; + cout << " #normal_travs = " << float(data.normal.travs )*1E-6 << "M" << std::endl; + cout << " #nodes = " << float(data.normal.trav_nodes )*1E-6 << "M" << std::endl; + cout << " #nodes_xfm = " << float(data.normal.trav_xfm_nodes )*1E-6 << "M" << std::endl; + cout << " #leaves = " << 
float(data.normal.trav_leaves )*1E-6 << "M" << std::endl; + cout << " #prims = " << float(data.normal.trav_prims )*1E-6 << "M" << std::endl; + cout << " #prim_hits = " << float(data.normal.trav_prim_hits )*1E-6 << "M" << std::endl; + + cout << " #stack nodes = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl; + cout << " #stack pop = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl; + + size_t normal_box_hits = 0; + size_t weighted_box_hits = 0; + for (size_t i=0;i travs; + std::atomic trav_nodes; + std::atomic trav_leaves; + std::atomic trav_prims; + std::atomic trav_prim_hits; + std::atomic trav_hit_boxes[SIZE_HISTOGRAM+1]; + std::atomic trav_stack_pop; + std::atomic trav_stack_nodes; + std::atomic trav_xfm_nodes; + + } normal, shadow, point_query; + } all, active, code; + + std::atomic user[10]; + }; + + public: + + static __forceinline Counters& get() { + return instance.cntrs; + } + + static void clear() { + instance.cntrs.clear(); + } + + static void print(embree_ostream cout); + + private: + Counters cntrs; + + private: + static Stat instance; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp new file mode 100644 index 00000000000..51fc9b78262 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/state.cpp @@ -0,0 +1,543 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "state.h" +#include "../../common/lexers/streamfilters.h" + +namespace embree +{ + MutexSys g_printMutex; + + State::ErrorHandler State::g_errorHandler; + + State::ErrorHandler::ErrorHandler() + : thread_error(createTls()) {} + + State::ErrorHandler::~ErrorHandler() + { + Lock lock(errors_mutex); + for (size_t i=0; i lock(errors_mutex); + stored_error = new RTCError(RTC_ERROR_NONE); + thread_errors.push_back(stored_error); + setTls(thread_error,stored_error); + return stored_error; + } + + State::State () + : 
enabled_cpu_features(getCPUFeatures()), + enabled_builder_cpu_features(enabled_cpu_features), + frequency_level(FREQUENCY_SIMD256) + { + tri_accel = "default"; + tri_builder = "default"; + tri_traverser = "default"; + + tri_accel_mb = "default"; + tri_builder_mb = "default"; + tri_traverser_mb = "default"; + + quad_accel = "default"; + quad_builder = "default"; + quad_traverser = "default"; + + quad_accel_mb = "default"; + quad_builder_mb = "default"; + quad_traverser_mb = "default"; + + line_accel = "default"; + line_builder = "default"; + line_traverser = "default"; + + line_accel_mb = "default"; + line_builder_mb = "default"; + line_traverser_mb = "default"; + + hair_accel = "default"; + hair_builder = "default"; + hair_traverser = "default"; + + hair_accel_mb = "default"; + hair_builder_mb = "default"; + hair_traverser_mb = "default"; + + object_accel = "default"; + object_builder = "default"; + object_accel_min_leaf_size = 1; + object_accel_max_leaf_size = 1; + + object_accel_mb = "default"; + object_builder_mb = "default"; + object_accel_mb_min_leaf_size = 1; + object_accel_mb_max_leaf_size = 1; + + max_spatial_split_replications = 1.2f; + useSpatialPreSplits = false; + + tessellation_cache_size = 128*1024*1024; + + subdiv_accel = "default"; + subdiv_accel_mb = "default"; + + grid_accel = "default"; + grid_builder = "default"; + grid_accel_mb = "default"; + grid_builder_mb = "default"; + + instancing_open_min = 0; + instancing_block_size = 0; + instancing_open_factor = 8.0f; + instancing_open_max_depth = 32; + instancing_open_max = 50000000; + + ignore_config_files = false; + float_exceptions = false; + quality_flags = -1; + scene_flags = -1; + verbose = 0; + benchmark = 0; + + numThreads = 0; + numUserThreads = 0; + +#if TASKING_INTERNAL + set_affinity = true; +#else + set_affinity = false; +#endif + /* per default enable affinity on KNL */ + if (hasISA(AVX512KNL)) set_affinity = true; + + start_threads = false; + enable_selockmemoryprivilege = false; +#if 
defined(__LINUX__) + hugepages = true; +#else + hugepages = false; +#endif + hugepages_success = true; + + alloc_main_block_size = 0; + alloc_num_main_slots = 0; + alloc_thread_block_size = 0; + alloc_single_thread_alloc = -1; + + error_function = nullptr; + error_function_userptr = nullptr; + + memory_monitor_function = nullptr; + memory_monitor_userptr = nullptr; + } + + State::~State() { + } + + bool State::hasISA(const int isa) { + return (enabled_cpu_features & isa) == isa; + } + + bool State::checkISASupport() { +#if defined(__ARM_NEON) + /* + * NEON CPU type is a mixture of NEON and SSE2 + */ + + bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; + + /* this will be true when explicitly initialize Device with `isa=neon` config */ + bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; + + return hasSSE2 || hasNEON; +#else + return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; +#endif + } + + void State::verify() + { + /* verify that calculations stay in range */ + assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX); + + /* here we verify that CPP files compiled for a specific ISA only + * call that same or lower ISA version of non-inlined class member + * functions */ +#if defined(DEBUG) +#if defined(EMBREE_TARGET_SSE2) +#if !defined(__ARM_NEON) + assert(sse2::getISA() <= SSE2); +#endif +#endif +#if defined(EMBREE_TARGET_SSE42) + assert(sse42::getISA() <= SSE42); +#endif +#if defined(EMBREE_TARGET_AVX) + assert(avx::getISA() <= AVX); +#endif +#if defined(EMBREE_TARGET_AVX2) + assert(avx2::getISA() <= AVX2); +#endif +#if defined (EMBREE_TARGET_AVX512KNL) + assert(avx512knl::getISA() <= AVX512KNL); +#endif +#if defined (EMBREE_TARGET_AVX512SKX) + assert(avx512skx::getISA() <= AVX512SKX); +#endif +#endif + } + + const char* symbols[3] = { "=", ",", "|" }; + + bool State::parseFile(const FileName& fileName) + { + FILE* f = fopen(fileName.c_str(),"r"); + if (!f) return false; + Ref > 
file = new FileStream(f,fileName); + + std::vector syms; + for (size_t i=0; i cin = new TokenStream(new LineCommentFilter(file,"#"), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + return true; + } + + void State::parseString(const char* cfg) + { + if (cfg == nullptr) return; + + std::vector syms; + for (size_t i=0; i cin = new TokenStream(new StrStream(cfg), + TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.", + TokenStream::separators,syms); + parse(cin); + } + + int string_to_cpufeatures(const std::string& isa) + { + if (isa == "sse" ) return SSE; + else if (isa == "sse2") return SSE2; + else if (isa == "sse3") return SSE3; + else if (isa == "ssse3") return SSSE3; + else if (isa == "sse41") return SSE41; + else if (isa == "sse4.1") return SSE41; + else if (isa == "sse42") return SSE42; + else if (isa == "sse4.2") return SSE42; + else if (isa == "avx") return AVX; + else if (isa == "avxi") return AVXI; + else if (isa == "avx2") return AVX2; + else if (isa == "avx512knl") return AVX512KNL; + else if (isa == "avx512skx") return AVX512SKX; + else return SSE2; + } + + void State::parse(Ref cin) + { + /* parse until end of stream */ + while (cin->peek() != Token::Eof()) + { + const Token tok = cin->get(); + + if (tok == Token::Id("threads") && cin->trySymbol("=")) + numThreads = cin->get().Int(); + + else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) + numUserThreads = cin->get().Int(); + + else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) + set_affinity = cin->get().Int(); + + else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) + start_threads = cin->get().Int(); + + else if (tok == Token::Id("isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_cpu_features = string_to_cpufeatures(isa); + 
enabled_builder_cpu_features = enabled_cpu_features; + } + + else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_cpu_features &= string_to_cpufeatures(isa); + enabled_builder_cpu_features &= enabled_cpu_features; + } + + else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { + std::string isa = toLowerCase(cin->get().Identifier()); + enabled_builder_cpu_features &= string_to_cpufeatures(isa); + } + + else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) { + std::string freq = cin->get().Identifier(); + if (freq == "simd128") frequency_level = FREQUENCY_SIMD128; + else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256; + else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; + } + + else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) { + enable_selockmemoryprivilege = cin->get().Int(); + } + else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) { + hugepages = cin->get().Int(); + } + + else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("=")) + ignore_config_files = cin->get().Int(); + else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) + float_exceptions = cin->get().Int(); + + else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) + tri_accel = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("=")) + tri_builder = cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("=")) + tri_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("=")) + tri_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("=")) + tri_builder_mb = 
cin->get().Identifier(); + else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("=")) + tri_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("=")) + quad_accel = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("=")) + quad_builder = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("=")) + quad_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("=")) + quad_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("=")) + quad_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("=")) + quad_traverser_mb = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel")) && cin->trySymbol("=")) + line_accel = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder")) && cin->trySymbol("=")) + line_builder = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("=")) + line_traverser = cin->get().Identifier(); + + else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("=")) + line_accel_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("=")) + line_builder_mb = cin->get().Identifier(); + else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("=")) + line_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel") && cin->trySymbol("=")) + hair_accel = cin->get().Identifier(); + else if (tok == Token::Id("hair_builder") && cin->trySymbol("=")) + hair_builder = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser") && cin->trySymbol("=")) + hair_traverser = cin->get().Identifier(); + + else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("=")) + hair_accel_mb = 
cin->get().Identifier(); + else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("=")) + hair_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("=")) + hair_traverser_mb = cin->get().Identifier(); + + else if (tok == Token::Id("object_accel") && cin->trySymbol("=")) + object_accel = cin->get().Identifier(); + else if (tok == Token::Id("object_builder") && cin->trySymbol("=")) + object_builder = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("=")) + object_accel_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("=")) + object_accel_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("=")) + object_accel_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("=")) + object_builder_mb = cin->get().Identifier(); + else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("=")) + object_accel_mb_min_leaf_size = cin->get().Int(); + else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("=")) + object_accel_mb_max_leaf_size = cin->get().Int(); + + else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("=")) + instancing_open_min = cin->get().Int(); + else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { + instancing_block_size = cin->get().Int(); + instancing_open_factor = 0.0f; + } + else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("=")) + instancing_open_max_depth = cin->get().Int(); + else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { + instancing_block_size = 0; + instancing_open_factor = cin->get().Float(); + } + else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("=")) + instancing_open_max = cin->get().Int(); + + else if (tok == Token::Id("subdiv_accel") && 
cin->trySymbol("=")) + subdiv_accel = cin->get().Identifier(); + else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("=")) + subdiv_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) + grid_accel = cin->get().Identifier(); + else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("=")) + grid_accel_mb = cin->get().Identifier(); + + else if (tok == Token::Id("verbose") && cin->trySymbol("=")) + verbose = cin->get().Int(); + else if (tok == Token::Id("benchmark") && cin->trySymbol("=")) + benchmark = cin->get().Int(); + + else if (tok == Token::Id("quality")) { + if (cin->trySymbol("=")) { + Token flag = cin->get(); + if (flag == Token::Id("low")) quality_flags = RTC_BUILD_QUALITY_LOW; + else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM; + else if (flag == Token::Id("high")) quality_flags = RTC_BUILD_QUALITY_HIGH; + } + } + + else if (tok == Token::Id("scene_flags")) { + scene_flags = 0; + if (cin->trySymbol("=")) { + do { + Token flag = cin->get(); + if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC; + else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT; + else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST; + } while (cin->trySymbol("|")); + } + } + + else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("=")) + max_spatial_split_replications = cin->get().Float(); + + else if (tok == Token::Id("presplits") && cin->trySymbol("=")) + useSpatialPreSplits = cin->get().Int() != 0 ? 
true : false; + + else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) + tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f); + + else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("=")) + alloc_main_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("=")) + alloc_num_main_slots = cin->get().Int(); + else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("=")) + alloc_thread_block_size = cin->get().Int(); + else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("=")) + alloc_single_thread_alloc = cin->get().Int(); + + cin->trySymbol(","); // optional , separator + } + } + + bool State::verbosity(size_t N) { + return N <= verbose; + } + + void State::print() + { + std::cout << "general:" << std::endl; + std::cout << " build threads = " << numThreads << std::endl; + std::cout << " build user threads = " << numUserThreads << std::endl; + std::cout << " start_threads = " << start_threads << std::endl; + std::cout << " affinity = " << set_affinity << std::endl; + std::cout << " frequency_level = "; + switch (frequency_level) { + case FREQUENCY_SIMD128: std::cout << "simd128" << std::endl; break; + case FREQUENCY_SIMD256: std::cout << "simd256" << std::endl; break; + case FREQUENCY_SIMD512: std::cout << "simd512" << std::endl; break; + default: std::cout << "error" << std::endl; break; + } + + std::cout << " hugepages = "; + if (!hugepages) std::cout << "disabled" << std::endl; + else if (hugepages_success) std::cout << "enabled" << std::endl; + else std::cout << "failed" << std::endl; + + std::cout << " verbosity = " << verbose << std::endl; + std::cout << " cache_size = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl; + std::cout << " max_spatial_split_replications = " << 
max_spatial_split_replications << std::endl; + + std::cout << "triangles:" << std::endl; + std::cout << " accel = " << tri_accel << std::endl; + std::cout << " builder = " << tri_builder << std::endl; + std::cout << " traverser = " << tri_traverser << std::endl; + + std::cout << "motion blur triangles:" << std::endl; + std::cout << " accel = " << tri_accel_mb << std::endl; + std::cout << " builder = " << tri_builder_mb << std::endl; + std::cout << " traverser = " << tri_traverser_mb << std::endl; + + std::cout << "quads:" << std::endl; + std::cout << " accel = " << quad_accel << std::endl; + std::cout << " builder = " << quad_builder << std::endl; + std::cout << " traverser = " << quad_traverser << std::endl; + + std::cout << "motion blur quads:" << std::endl; + std::cout << " accel = " << quad_accel_mb << std::endl; + std::cout << " builder = " << quad_builder_mb << std::endl; + std::cout << " traverser = " << quad_traverser_mb << std::endl; + + std::cout << "line segments:" << std::endl; + std::cout << " accel = " << line_accel << std::endl; + std::cout << " builder = " << line_builder << std::endl; + std::cout << " traverser = " << line_traverser << std::endl; + + std::cout << "motion blur line segments:" << std::endl; + std::cout << " accel = " << line_accel_mb << std::endl; + std::cout << " builder = " << line_builder_mb << std::endl; + std::cout << " traverser = " << line_traverser_mb << std::endl; + + std::cout << "hair:" << std::endl; + std::cout << " accel = " << hair_accel << std::endl; + std::cout << " builder = " << hair_builder << std::endl; + std::cout << " traverser = " << hair_traverser << std::endl; + + std::cout << "motion blur hair:" << std::endl; + std::cout << " accel = " << hair_accel_mb << std::endl; + std::cout << " builder = " << hair_builder_mb << std::endl; + std::cout << " traverser = " << hair_traverser_mb << std::endl; + + std::cout << "subdivision surfaces:" << std::endl; + std::cout << " accel = " << subdiv_accel << std::endl; + + 
std::cout << "grids:" << std::endl; + std::cout << " accel = " << grid_accel << std::endl; + std::cout << " builder = " << grid_builder << std::endl; + + std::cout << "motion blur grids:" << std::endl; + std::cout << " accel = " << grid_accel_mb << std::endl; + std::cout << " builder = " << grid_builder_mb << std::endl; + + std::cout << "object_accel:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_max_leaf_size << std::endl; + + std::cout << "object_accel_mb:" << std::endl; + std::cout << " min_leaf_size = " << object_accel_mb_min_leaf_size << std::endl; + std::cout << " max_leaf_size = " << object_accel_mb_max_leaf_size << std::endl; + } +} diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h new file mode 100644 index 00000000000..d0fccc023f8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/state.h @@ -0,0 +1,197 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "default.h" + +namespace embree +{ + /* mutex to make printing to cout thread safe */ + extern MutexSys g_printMutex; + + struct State : public RefCount + { + public: + /*! state construction */ + State (); + + /*! state destruction */ + ~State(); + + /*! verifies that state is correct */ + void verify(); + + /*! parses state from a configuration file */ + bool parseFile(const FileName& fileName); + + /*! parses the state from a string */ + void parseString(const char* cfg); + + /*! parses the state from a stream */ + void parse(Ref cin); + + /*! prints the state */ + void print(); + + /*! checks if verbosity level is at least N */ + bool verbosity(size_t N); + + /*! checks if some particular ISA is enabled */ + bool hasISA(const int isa); + + /*! 
check whether selected ISA is supported by the HW */ + bool checkISASupport(); + + public: + std::string tri_accel; //!< acceleration structure to use for triangles + std::string tri_builder; //!< builder to use for triangles + std::string tri_traverser; //!< traverser to use for triangles + + public: + std::string tri_accel_mb; //!< acceleration structure to use for motion blur triangles + std::string tri_builder_mb; //!< builder to use for motion blur triangles + std::string tri_traverser_mb; //!< traverser to use for triangles + + public: + std::string quad_accel; //!< acceleration structure to use for quads + std::string quad_builder; //!< builder to use for quads + std::string quad_traverser; //!< traverser to use for quads + + public: + std::string quad_accel_mb; //!< acceleration structure to use for motion blur quads + std::string quad_builder_mb; //!< builder to use for motion blur quads + std::string quad_traverser_mb; //!< traverser to use for motion blur quads + + public: + std::string line_accel; //!< acceleration structure to use for line segments + std::string line_builder; //!< builder to use for line segments + std::string line_traverser; //!< traverser to use for line segments + + public: + std::string line_accel_mb; //!< acceleration structure to use for motion blur line segments + std::string line_builder_mb; //!< builder to use for motion blur line segments + std::string line_traverser_mb; //!< traverser to use for motion blur line segments + + public: + std::string hair_accel; //!< hair acceleration structure to use + std::string hair_builder; //!< builder to use for hair + std::string hair_traverser; //!< traverser to use for hair + + public: + std::string hair_accel_mb; //!< acceleration structure to use for motion blur hair + std::string hair_builder_mb; //!< builder to use for motion blur hair + std::string hair_traverser_mb; //!< traverser to use for motion blur hair + + public: + std::string object_accel; //!< acceleration structure for 
user geometries + std::string object_builder; //!< builder for user geometries + int object_accel_min_leaf_size; //!< minimum leaf size for object acceleration structure + int object_accel_max_leaf_size; //!< maximum leaf size for object acceleration structure + + public: + std::string object_accel_mb; //!< acceleration structure for user geometries + std::string object_builder_mb; //!< builder for user geometries + int object_accel_mb_min_leaf_size; //!< minimum leaf size for mblur object acceleration structure + int object_accel_mb_max_leaf_size; //!< maximum leaf size for mblur object acceleration structure + + public: + std::string subdiv_accel; //!< acceleration structure to use for subdivision surfaces + std::string subdiv_accel_mb; //!< acceleration structure to use for subdivision surfaces + + public: + std::string grid_accel; //!< acceleration structure to use for grids + std::string grid_builder; //!< builder for grids + std::string grid_accel_mb; //!< acceleration structure to use for motion blur grids + std::string grid_builder_mb; //!< builder for motion blur grids + + public: + float max_spatial_split_replications; //!< maximally replications*N many primitives in accel for spatial splits + bool useSpatialPreSplits; //!< use spatial pre-splits instead of the full spatial split builder + size_t tessellation_cache_size; //!< size of the shared tessellation cache + + public: + size_t instancing_open_min; //!< instancing opens tree to minimally that number of subtrees + size_t instancing_block_size; //!< instancing opens tree up to average block size of primitives + float instancing_open_factor; //!< instancing opens tree up to x times the number of instances + size_t instancing_open_max_depth; //!< maximum open depth for geometries + size_t instancing_open_max; //!< instancing opens tree to maximally that number of subtrees + + public: + bool ignore_config_files; //!< if true no more config files get parse + bool float_exceptions; //!< enable floating 
point exceptions + int quality_flags; + int scene_flags; + size_t verbose; //!< verbosity of output + size_t benchmark; //!< true + + public: + size_t numThreads; //!< number of threads to use in builders + size_t numUserThreads; //!< number of user provided threads to use in builders + bool set_affinity; //!< sets affinity for worker threads + bool start_threads; //!< true when threads should be started at device creation time + int enabled_cpu_features; //!< CPU ISA features to use + int enabled_builder_cpu_features; //!< CPU ISA features to use for builders only + enum FREQUENCY_LEVEL { + FREQUENCY_SIMD128, + FREQUENCY_SIMD256, + FREQUENCY_SIMD512 + } frequency_level; //!< frequency level the app wants to run on (default is SIMD256) + bool enable_selockmemoryprivilege; //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages + bool hugepages; //!< true if huge pages should get used + bool hugepages_success; //!< status for enabling huge pages + + public: + size_t alloc_main_block_size; //!< main allocation block size (shared between threads) + int alloc_num_main_slots; //!< number of such shared blocks to be used to allocate + size_t alloc_thread_block_size; //!< size of thread local allocator block size + int alloc_single_thread_alloc; //!< in single mode nodes and leaves use same thread local allocator + + public: + + /*! checks if we can use AVX */ + bool canUseAVX() { + return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128; + } + + /*! 
checks if we can use AVX2 */ + bool canUseAVX2() { + return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128; + } + + struct ErrorHandler + { + public: + ErrorHandler(); + ~ErrorHandler(); + RTCError* error(); + + public: + tls_t thread_error; + std::vector thread_errors; + MutexSys errors_mutex; + }; + ErrorHandler errorHandler; + static ErrorHandler g_errorHandler; + + public: + void setErrorFunction(RTCErrorFunction fptr, void* uptr) + { + error_function = fptr; + error_function_userptr = uptr; + } + + RTCErrorFunction error_function; + void* error_function_userptr; + + public: + void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) + { + memory_monitor_function = fptr; + memory_monitor_userptr = uptr; + } + + RTCMemoryMonitorFunction memory_monitor_function; + void* memory_monitor_userptr; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h new file mode 100644 index 00000000000..b4787622403 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/common/vector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "default.h" + +namespace embree +{ + /*! invokes the memory monitor callback */ + struct MemoryMonitorInterface { + virtual void memoryMonitor(ssize_t bytes, bool post) = 0; + }; + + /*! 
allocator that performs aligned monitored allocations */ + template + struct aligned_monitored_allocator + { + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) + : device(device), hugepages(false) {} + + __forceinline pointer allocate( size_type n ) + { + if (n) { + assert(device); + device->memoryMonitor(n*sizeof(T),false); + } + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + { + pointer p = (pointer) os_malloc(n*sizeof(value_type),hugepages); + assert(p); + return p; + } + return (pointer) alignedMalloc(n*sizeof(value_type),alignment); + } + + __forceinline void deallocate( pointer p, size_type n ) + { + if (p) + { + if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) + os_free(p,n*sizeof(value_type),hugepages); + else + alignedFree(p); + } + else assert(n == 0); + + if (n) { + assert(device); + device->memoryMonitor(-ssize_t(n)*sizeof(T),true); + } + } + + __forceinline void construct( pointer p, const_reference val ) { + new (p) T(val); + } + + __forceinline void destroy( pointer p ) { + p->~T(); + } + + private: + MemoryMonitorInterface* device; + bool hugepages; + }; + + /*! 
monitored vector */ + template + using mvector = vector_t::value> >; +} diff --git a/thirdparty/embree-aarch64/kernels/config.h b/thirdparty/embree-aarch64/kernels/config.h new file mode 100644 index 00000000000..80a8ab2a564 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/config.h @@ -0,0 +1,76 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +/* #undef EMBREE_RAY_MASK */ +/* #undef EMBREE_STAT_COUNTERS */ +/* #undef EMBREE_BACKFACE_CULLING */ +/* #undef EMBREE_BACKFACE_CULLING_CURVES */ +#define EMBREE_FILTER_FUNCTION +/* #undef EMBREE_IGNORE_INVALID_RAYS */ +#define EMBREE_GEOMETRY_TRIANGLE +/* #undef EMBREE_GEOMETRY_QUAD */ +/* #undef EMBREE_GEOMETRY_CURVE */ +/* #undef EMBREE_GEOMETRY_SUBDIVISION */ +/* #undef EMBREE_GEOMETRY_USER */ +/* #undef EMBREE_GEOMETRY_INSTANCE */ +/* #undef EMBREE_GEOMETRY_GRID */ +/* #undef EMBREE_GEOMETRY_POINT */ +/* #undef EMBREE_RAY_PACKETS */ +/* #undef EMBREE_COMPACT_POLYS */ + +#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + #define IF_ENABLED_TRIS(x) x +#else + #define IF_ENABLED_TRIS(x) +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + #define IF_ENABLED_QUADS(x) x +#else + #define IF_ENABLED_QUADS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_CURVES_OR_POINTS(x) x +#else + #define IF_ENABLED_CURVES_OR_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_CURVE) + #define IF_ENABLED_CURVES(x) x +#else + #define IF_ENABLED_CURVES(x) +#endif + +#if defined(EMBREE_GEOMETRY_POINT) + #define IF_ENABLED_POINTS(x) x +#else + #define IF_ENABLED_POINTS(x) +#endif + +#if defined(EMBREE_GEOMETRY_SUBDIVISION) + #define IF_ENABLED_SUBDIV(x) x +#else + #define IF_ENABLED_SUBDIV(x) +#endif + +#if defined(EMBREE_GEOMETRY_USER) + #define IF_ENABLED_USER(x) x +#else + #define IF_ENABLED_USER(x) +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + #define IF_ENABLED_INSTANCE(x) x +#else + #define 
IF_ENABLED_INSTANCE(x) +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + #define IF_ENABLED_GRIDS(x) x +#else + #define IF_ENABLED_GRIDS(x) +#endif diff --git a/thirdparty/embree-aarch64/kernels/geometry/cone.h b/thirdparty/embree-aarch64/kernels/geometry/cone.h new file mode 100644 index 00000000000..961ef86160a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/cone.h @@ -0,0 +1,321 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cone + { + const Vec3fa p0; //!< start position of cone + const Vec3fa p1; //!< end position of cone + const float r0; //!< start radius of cone + const float r1; //!< end radius of cone + + __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3fa v0 = p0-org; + const Vec3fa v1 = p1-org; + + const float rl = rcp_length(v1-v0); + const Vec3fa P0 = v0, dP = (v1-v0)*rl; + const float dr = (r1-r0)*rl; + const Vec3fa O = -P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float R = r0 + Oz*dr; + const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr)); + const float B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const float C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) return false; + + /* special case for rays that are "parallel" to the cone */ + const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (unlikely(abs(A) < eps)) + { + /* cylinder case */ + if (abs(dr) < 16.0f*float(ulp)) { + if (C 
<= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } + else { t_o = BBox1f(pos_inf,neg_inf); return false; } + } + + /* cone case */ + else + { + /* if we hit the negative cone there cannot be a hit */ + const float t = -C/B; + const float z0 = Oz+t*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + + /* test if we start inside or outside the cone */ + if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf); + else t_o = BBox1f(neg_inf,t); + } + } + + /* standard case for "non-parallel" rays */ + else + { + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + t_o.lower = (-B-Q)*rcp_2A; + t_o.upper = (-B+Q)*rcp_2A; + + /* standard case where both hits are on same cone */ + if (likely(A > 0.0f)) { + const float z0 = Oz+t_o.lower*dOz; + const float z0r = r0+z0*dr; + if (z0r < 0.0f) return false; + } + + /* special case where the hits are on the positive and negative cone */ + else + { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + if (dOz*dr > 0) t_o.upper = pos_inf; + else t_o.lower = neg_inf; + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3fa Pr = t_o.lower*dir; + const Vec3fa Pl = v0 + u0_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3fa Pr = t_o.upper*dir; + const Vec3fa Pl = v0 + u1_o*(v1-v0); + const Vec3fa R = normalize(Pr-Pl); + const Vec3fa U = (p1-p0)+(r1-r0)*R; + const Vec3fa V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return true; + } + + __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool 
shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cone.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cone class */ + static bool verify() + { + bool passed = true; + const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf); + passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f); + passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf); + passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f); + passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f); + passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6); + passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f); + const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f); + passed &= 
verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f); + passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f); + const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! 
output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) { + return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}"; + } + }; + + template + struct ConeN + { + typedef Vec3> Vec3vfN; + + const Vec3vfN p0; //!< start position of cone + const Vec3vfN p1; //!< end position of cone + const vfloat r0; //!< start radius of cone + const vfloat r1; //!< end radius of cone + + __forceinline ConeN(const Vec3vfN& p0, const vfloat& r0, const Vec3vfN& p1, const vfloat& r1) + : p0(p0), p1(p1), r0(r0), r1(r1) {} + + __forceinline Cone operator[] (const size_t i) const + { + assert(i intersect(const Vec3fa& org, const Vec3fa& dir, + BBox>& t_o, + vfloat& u0_o, Vec3vfN& Ng0_o, + vfloat& u1_o, Vec3vfN& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const Vec3vfN v0 = p0-Vec3vfN(org); + const Vec3vfN v1 = p1-Vec3vfN(org); + + const vfloat rl = rcp_length(v1-v0); + const Vec3vfN P0 = v0, dP = (v1-v0)*rl; + const vfloat dr = (r1-r0)*rl; + const Vec3vfN O = -P0, dO = dir; + + const vfloat dOdO = dot(dO,dO); + const vfloat OdO = dot(dO,O); + const vfloat OO = dot(O,O); + const vfloat dOz = dot(dP,dO); + const vfloat Oz = dot(dP,O); + + const vfloat R = r0 + Oz*dr; + const vfloat A = dOdO - sqr(dOz) * (vfloat(1.0f)+sqr(dr)); + const vfloat B = 2.0f * (OdO - dOz*(Oz + R*dr)); + const vfloat C = OO - (sqr(Oz) + sqr(R)); + + /* we miss the cone if determinant is smaller than zero */ + const vfloat D = B*B - 4.0f*A*C; + vbool valid = D >= 0.0f; + if (none(valid)) return valid; + + /* special case for rays that are "parallel" to the cone */ + const vfloat eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + const vbool validt = valid & (abs(A) < eps); + const vbool validf = valid & !(abs(A) < eps); + if (unlikely(any(validt))) + { + const vboolx validtt = validt & (abs(dr) < 16.0f*float(ulp)); + const vboolx validtf = validt & (abs(dr) >= 16.0f*float(ulp)); + + /* 
cylinder case */ + if (unlikely(any(validtt))) + { + t_o.lower = select(validtt, select(C <= 0.0f, vfloat(neg_inf), vfloat(pos_inf)), t_o.lower); + t_o.upper = select(validtt, select(C <= 0.0f, vfloat(pos_inf), vfloat(neg_inf)), t_o.upper); + valid &= !validtt | C <= 0.0f; + } + + /* cone case */ + if (any(validtf)) + { + /* if we hit the negative cone there cannot be a hit */ + const vfloat t = -C/B; + const vfloat z0 = Oz+t*dOz; + const vfloat z0r = r0+z0*dr; + valid &= !validtf | z0r >= 0.0f; + + /* test if we start inside or outside the cone */ + t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat(neg_inf)), t_o.lower); + t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat(pos_inf), t), t_o.upper); + } + } + + /* standard case for "non-parallel" rays */ + if (likely(any(validf))) + { + const vfloat Q = sqrt(D); + const vfloat rcp_2A = 0.5f*rcp(A); + t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower); + t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper); + + /* standard case where both hits are on same cone */ + const vbool validft = validf & A>0.0f; + const vbool validff = validf & !(A>0.0f); + if (any(validft)) { + const vfloat z0 = Oz+t_o.lower*dOz; + const vfloat z0r = r0+z0*dr; + valid &= !validft | z0r >= 0.0f; + } + + /* special case where the hits are on the positive and negative cone */ + if (any(validff)) { + /* depending on the ray direction and the open direction + * of the cone we have a hit from inside or outside the + * cone */ + t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, float(neg_inf)), t_o.lower); + t_o.upper = select(validff, select(dOz*dr > 0.0f, float(pos_inf), t_o.upper), t_o.upper); + } + } + + /* calculates u and Ng for near hit */ + { + u0_o = (Oz+t_o.lower*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u0_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng0_o = cross(V,U); + } + + /* calculates 
u and Ng for far hit */ + { + u1_o = (Oz+t_o.upper*dOz)*rl; + const Vec3vfN Pr = t_o.lower*Vec3vfN(dir); + const Vec3vfN Pl = v0 + u1_o*(v1-v0); + const Vec3vfN R = normalize(Pr-Pl); + const Vec3vfN U = (p1-p0)+(r1-r0)*R; + const Vec3vfN V = cross(p1-p0,R); + Ng1_o = cross(V,U); + } + return valid; + } + + __forceinline vbool intersect(const Vec3fa& org, const Vec3fa& dir, BBox>& t_o) const + { + vfloat u0_o; Vec3vfN Ng0_o; vfloat u1_o; Vec3vfN Ng1_o; + return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h new file mode 100644 index 00000000000..0902baff7db --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h @@ -0,0 +1,209 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + namespace __coneline_internal + { + template + static __forceinline bool intersectCone(const vbool& valid_i, + const Vec3vf& ray_org_in, const Vec3vf& ray_dir, + const vfloat& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf& v0, const Vec4vf& v1, + const vbool& cL, const vbool& cR, + const Epilog& epilog) + { + vbool valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat dOdO = sqr(ray_dir); + const vfloat rcp_dOdO = rcp(dOdO); + const Vec3vf center = vfloat(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf ray_org = ray_org_in + dt*ray_dir; + + const Vec3vf dP = v1.xyz() - v0.xyz(); + const Vec3vf p0 = ray_org - v0.xyz(); + const Vec3vf p1 = ray_org - v1.xyz(); + + const vfloat dPdP = sqr(dP); + const vfloat dP0 = dot(p0,dP); + const vfloat dP1 = dot(p1,dP); + const vfloat dOdP = dot(ray_dir,dP); + + // intersect cone 
body + const vfloat dr = v0.w - v1.w; + const vfloat hy = dPdP + sqr(dr); + const vfloat dO0 = dot(ray_dir,p0); + const vfloat OO = sqr(p0); + const vfloat dPdP2 = sqr(dPdP); + const vfloat dPdPr0 = dPdP*v0.w; + + const vfloat A = dPdP2 - sqr(dOdP)*hy; + const vfloat B = dPdP2*dO0 - dP0*dOdP*hy + dPdPr0*(dr*dOdP); + const vfloat C = dPdP2*OO - sqr(dP0)*hy + dPdPr0*(2.0f*dr*dP0 - dPdPr0); + + const vfloat D = B*B - A*C; + valid &= D >= 0.0f; + if (unlikely(none(valid))) { + return false; + } + + /* standard case for "non-parallel" rays */ + const vfloat Q = sqrt(D); + const vfloat rcp_A = rcp(A); + /* special case for rays that are "parallel" to the cone - assume miss */ + const vbool isParallel = abs(A) <= min_rcp_input; + + vfloat t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A); + vfloat t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A); + const vfloat y_lower = dP0 + t_cone_lower*dOdP; + const vfloat y_upper = dP0 + t_cone_upper*dOdP; + t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf); + t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf); + + const vbool hitDisk0 = valid & cL; + const vbool hitDisk1 = valid & cR; + const vfloat rcp_dOdP = rcp(dOdP); + const vfloat t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf); + const vfloat t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf); + const vfloat t_disk_lower = min(t_disk0, t_disk1); + const vfloat t_disk_upper = max(t_disk0, t_disk1); + + const vfloat t_lower = min(t_cone_lower, t_disk_lower); + const vfloat t_upper = max(t_cone_upper, select(t_lower==t_disk_lower, + select(t_disk_upper==vfloat(pos_inf),neg_inf,t_disk_upper), + select(t_disk_lower==vfloat(pos_inf),neg_inf,t_disk_lower))); + + const vbool valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != 
vfloat(pos_inf); + const vbool valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat(neg_inf); + + const vbool valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + const vfloat t_first = select(valid_lower, t_lower, t_upper); + const vfloat y_first = select(valid_lower, y_lower, y_upper); + + const vfloat rcp_dPdP = rcp(dPdP); + const Vec3vf dP2drr0dP = dPdP*dr*v0.w*dP; + const Vec3vf dPhy = dP*hy; + const vbool cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper); + const vbool disk0_hit_first = valid & (t_first == t_disk0); + const Vec3vf Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP)); + const vfloat u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat(zero), vfloat(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat t_second = t_upper; + const vfloat y_second = y_upper; + const vbool valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool disk0_hit_second = t_second == t_disk0; + const Vec3vf Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP)); + const vfloat u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_first, vfloat(zero), vfloat(one))); + + hit = RoundLineIntersectorHitM(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + } + + template + 
struct ConeLineIntersectorHitM + { + __forceinline ConeLineIntersectorHitM() {} + + __forceinline ConeLineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct ConeCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat operator() () const { return ray.tfar; }; + }; + + template + static __forceinline bool intersect(const vbool& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& v0i, const Vec4vf& v1i, + const vbool& cL, const vbool& cR, + const Epilog& epilog) + { + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat ray_tnear(ray.tnear()); + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,cL,cR,epilog); + } + }; + + template + struct ConeCurveIntersectorK + { + typedef CurvePrecalculationsK Precalculations; + + struct ray_tfar { + RayK& ray; + size_t k; + __forceinline ray_tfar(RayK& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat operator() () const { return ray.tfar[k]; }; + }; + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& 
v0i, const Vec4vf& v1i, + const vbool& cL, const vbool& cR, + const Epilog& epilog) + { + const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat ray_tnear = ray.tnear()[k]; + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + return __coneline_internal::intersectCone(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,cL,cR,epilog); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h new file mode 100644 index 00000000000..d47218eb8b7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h @@ -0,0 +1,141 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "coneline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template + struct ConeCurveMiIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool valid = line.template valid(); + ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool valid = line.template valid(); 
+ return ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct ConeCurveMiMBIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool valid = line.template valid(); + ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()); + const vbool valid = line.template valid(); + return ConeCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct ConeCurveMiIntersectorK + { + typedef LineMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const 
LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool valid = line.template valid(); + ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom); + const vbool valid = line.template valid(); + return ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + + template + struct ConeCurveMiMBIntersectorK + { + typedef LineMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool valid = line.template valid(); + ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; + vbool cL,cR; + line.gather(v0,v1,cL,cR,geom,ray.time()[k]); + const vbool valid = line.template valid(); + return ConeCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,cL,cR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git 
a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h new file mode 100644 index 00000000000..51384f19590 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h @@ -0,0 +1,222 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template + struct CurveNi + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue"); + return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNi () {} + + /*! 
fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + N = (uint8_t)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (uint8_t) scene->get(geomID0)->getType(); + + /* encode all primitives */ + BBox3fa bounds = empty; + for (size_t i=0; iget(geomID)->vbounds(primID)); + } + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + + /* encode all primitives */ + for (size_t i=0; iget(geomID)->computeAlignedSpace(primID); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID); + + bounds_vx_x(N)[i] = (int8_t) space3.vx.x; + bounds_vx_y(N)[i] = (int8_t) space3.vx.y; + bounds_vx_z(N)[i] = (int8_t) space3.vx.z; + bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f); + bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (int8_t) space3.vy.x; + bounds_vy_y(N)[i] = (int8_t) space3.vy.y; + bounds_vy_z(N)[i] = (int8_t) space3.vy.z; + bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f); + bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f); + + bounds_vz_x(N)[i] = (int8_t) 
space3.vz.x; + bounds_vz_y(N)[i] = (int8_t) space3.vz.y; + bounds_vz_z(N)[i] = (int8_t) space3.vz.z; + bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f); + bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + } + + template + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = CurveNi::blocks(set.size()); + size_t numbytes = CurveNi::bytes(set.size()); + CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; iscene); + } + return bvh->encodeLeaf((int8_t*)accel,items); + }; + + public: + + // 27.6 - 46 bytes per primitive + uint8_t ty; + uint8_t N; + uint8_t data[4+25*M+16]; + + /* + struct Layout + { + unsigned int geomID; + unsigned int primID[N]; + + int8_t bounds_vx_x[N]; + int8_t bounds_vx_y[N]; + int8_t bounds_vx_z[N]; + short bounds_vx_lower[N]; + short bounds_vx_upper[N]; + + int8_t bounds_vy_x[N]; + int8_t bounds_vy_y[N]; + int8_t bounds_vy_z[N]; + short bounds_vy_lower[N]; + short bounds_vy_upper[N]; + + int8_t bounds_vz_x[N]; + int8_t bounds_vz_y[N]; + int8_t bounds_vz_z[N]; + short bounds_vz_lower[N]; + short bounds_vz_upper[N]; + + Vec3f offset; + float scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); } + __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); } + + __forceinline int8_t* bounds_vx_x(size_t N) { return 
(int8_t*)((int8_t*)this+6+4*N); } + __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); } + + __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); } + __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); } + + __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); } + __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); } + + __forceinline short* bounds_vx_lower(size_t N) { return (short*)((int8_t*)this+6+7*N); } + __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); } + + __forceinline short* bounds_vx_upper(size_t N) { return (short*)((int8_t*)this+6+9*N); } + __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); } + + __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+11*N); } + __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); } + + __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+12*N); } + __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); } + + __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+13*N); } + __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); } + + __forceinline short* bounds_vy_lower(size_t N) { return (short*)((int8_t*)this+6+14*N); } + __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); } + + __forceinline short* bounds_vy_upper(size_t N) { return (short*)((int8_t*)this+6+16*N); } + __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); } + + __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+18*N); } + __forceinline 
const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); } + + __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+19*N); } + __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); } + + __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+20*N); } + __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); } + + __forceinline short* bounds_vz_lower(size_t N) { return (short*)((int8_t*)this+6+21*N); } + __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); } + + __forceinline short* bounds_vz_upper(size_t N) { return (short*)((int8_t*)this+6+23*N); } + __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+25*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); } + + __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+25*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); } + + __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+25*N+16; } + __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; } + }; + + template + typename CurveNi::Type CurveNi::type; + + typedef CurveNi<4> Curve4i; + typedef CurveNi<8> Curve8i; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h new file mode 100644 index 00000000000..0f9038c9fc9 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h @@ -0,0 +1,569 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + namespace isa + { + template + 
struct CurveNiIntersector1 + { + typedef CurveNi Primitive; + typedef Vec3vf Vec3vfM; + typedef LinearSpace3LinearSpace3vfM; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline vbool intersect(Ray& ray, const Primitive& prim, vfloat& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), + vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), + vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat t_lower_x = (vfloat::load(prim.bounds_vx_lower(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_upper_x = (vfloat::load(prim.bounds_vx_upper(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_lower_y = (vfloat::load(prim.bounds_vy_lower(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_upper_y = (vfloat::load(prim.bounds_vy_upper(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_lower_z = (vfloat::load(prim.bounds_vz_lower(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); + const vfloat t_upper_z = (vfloat::load(prim.bounds_vz_upper(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); + + const vfloat round_up (1.0f+3.0f*float(ulp)); + const vfloat round_down(1.0f-3.0f*float(ulp)); + const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear())); + const vfloat tFar = round_up 
*min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar)); + tNear_o = tNear; + return (vint(step) < vint(prim.N)) & (tNear <= tFar); + } + + template + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if 
(mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); 
+ + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); 
+ if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + }; + + template + struct CurveNiIntersectorK + { + typedef CurveNi Primitive; + typedef Vec3vf Vec3vfM; + typedef LinearSpace3LinearSpace3vfM; + typedef CurvePrecalculationsK 
Precalculations; + + static __forceinline vbool intersect(RayK& ray, const size_t k, const Primitive& prim, vfloat& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), + vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), + vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat t_lower_x = (vfloat::load(prim.bounds_vx_lower(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_upper_x = (vfloat::load(prim.bounds_vx_upper(N))-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_lower_y = (vfloat::load(prim.bounds_vy_lower(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_upper_y = (vfloat::load(prim.bounds_vy_upper(N))-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_lower_z = (vfloat::load(prim.bounds_vz_lower(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); + const vfloat t_upper_z = (vfloat::load(prim.bounds_vz_upper(N))-vfloat(org2.z))*vfloat(rcp_dir2.z); + + const vfloat round_up (1.0f+3.0f*float(ulp)); + const vfloat round_down(1.0f-3.0f*float(ulp)); + const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear()[k])); + const vfloat tFar = round_up 
*min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar[k])); + tNear_o = tNear; + return (vint(step) < vint(prim.N)) & (tNear <= tFar); + } + + template + static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID)); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + 
geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_n(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_n(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const 
unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + + unsigned int vertexID = geom->curve(primID); + Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + const unsigned int primID1 = prim.primID(N)[i1]; + geom->prefetchL1_vertices(geom->curve(primID1)); + if (mask1) { + const size_t i2 = bsf(mask1); + const unsigned int primID2 = prim.primID(N)[i2]; + geom->prefetchL2_vertices(geom->curve(primID2)); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_h(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_h(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = 
prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_hn(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_hn(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1; geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID)); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + 
return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h new file mode 100644 index 00000000000..0cd8f833fd4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h @@ -0,0 +1,278 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + template + struct CurveNiMB + { + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue"); + return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNiMB () {} + + /*! 
fill curve from curve list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range) + { + size_t end = min(begin+M,_end); + N = (uint8_t)(end-begin); + const unsigned int geomID0 = prims[begin].geomID(); + this->geomID(N) = geomID0; + ty = (uint8_t) scene->get(geomID0)->getType(); + + /* encode all primitives */ + LBBox3fa lbounds = empty; + for (size_t i=0; iget(geomID)->vlinearBounds(primID,time_range)); + } + BBox3fa bounds = lbounds.bounds(); + + /* calculate offset and scale */ + Vec3fa loffset = bounds.lower; + float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f))); + if (bounds.size() == Vec3fa(zero)) lscale = 0.0f; + *this->offset(N) = loffset; + *this->scale(N) = lscale; + this->time_offset(N) = time_range.lower; + this->time_scale(N) = 1.0f/time_range.size(); + + /* encode all primitives */ + for (size_t i=0; iget(geomID)->computeAlignedSpaceMB(primID,time_range); + + const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz)); + const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); + + // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug + bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x; + bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y; + bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z; + bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); + bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f); + bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f); + bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil 
(bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f); + + bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x; + bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y; + bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z; + bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f); + bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f); + bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f); + bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f); + + bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x; + bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y; + bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z; + bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f); + bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f); + bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f); + bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f); + assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f); + assert(-32767.0f <= floor(bounds.bounds1.lower.z) && 
floor(bounds.bounds1.lower.z) <= 32767.0f); + assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f); + + this->primID(N)[i] = primID; + } + + return lbounds; + } + + template + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = CurveNiMB::blocks(prims.size()); + size_t numbytes = CurveNiMB::bytes(prims.size()); + CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment); + const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; idata(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + + public: + + // 27.6 - 46 bytes per primitive + uint8_t ty; + uint8_t N; + uint8_t data[4+37*M+24]; + + /* + struct Layout + { + unsigned int geomID; + unsigned int primID[N]; + + int8_t bounds_vx_x[N]; + int8_t bounds_vx_y[N]; + int8_t bounds_vx_z[N]; + short bounds_vx_lower0[N]; + short bounds_vx_upper0[N]; + short bounds_vx_lower1[N]; + short bounds_vx_upper1[N]; + + int8_t bounds_vy_x[N]; + int8_t bounds_vy_y[N]; + int8_t bounds_vy_z[N]; + short bounds_vy_lower0[N]; + short bounds_vy_upper0[N]; + short bounds_vy_lower1[N]; + short bounds_vy_upper1[N]; + + int8_t bounds_vz_x[N]; + int8_t bounds_vz_y[N]; + int8_t bounds_vz_z[N]; + short bounds_vz_lower0[N]; + short bounds_vz_upper0[N]; + short bounds_vz_lower1[N]; + short bounds_vz_upper1[N]; + + Vec3f offset; + float scale; + + float time_offset; + float time_scale; + }; + */ + + __forceinline unsigned int& geomID(size_t N) { return *(unsigned int*)((int8_t*)this+2); } + __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); } + + __forceinline unsigned int* primID(size_t N) { return (unsigned int*)((int8_t*)this+6); } + __forceinline const unsigned 
int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); } + + __forceinline int8_t* bounds_vx_x(size_t N) { return (int8_t*)((int8_t*)this+6+4*N); } + __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); } + + __forceinline int8_t* bounds_vx_y(size_t N) { return (int8_t*)((int8_t*)this+6+5*N); } + __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); } + + __forceinline int8_t* bounds_vx_z(size_t N) { return (int8_t*)((int8_t*)this+6+6*N); } + __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); } + + __forceinline short* bounds_vx_lower0(size_t N) { return (short*)((int8_t*)this+6+7*N); } + __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); } + + __forceinline short* bounds_vx_upper0(size_t N) { return (short*)((int8_t*)this+6+9*N); } + __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); } + + __forceinline short* bounds_vx_lower1(size_t N) { return (short*)((int8_t*)this+6+11*N); } + __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); } + + __forceinline short* bounds_vx_upper1(size_t N) { return (short*)((int8_t*)this+6+13*N); } + __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); } + + __forceinline int8_t* bounds_vy_x(size_t N) { return (int8_t*)((int8_t*)this+6+15*N); } + __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); } + + __forceinline int8_t* bounds_vy_y(size_t N) { return (int8_t*)((int8_t*)this+6+16*N); } + __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); } + + __forceinline int8_t* bounds_vy_z(size_t N) { return (int8_t*)((int8_t*)this+6+17*N); } + __forceinline const int8_t* bounds_vy_z(size_t N) const { return 
(int8_t*)((int8_t*)this+6+17*N); } + + __forceinline short* bounds_vy_lower0(size_t N) { return (short*)((int8_t*)this+6+18*N); } + __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); } + + __forceinline short* bounds_vy_upper0(size_t N) { return (short*)((int8_t*)this+6+20*N); } + __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); } + + __forceinline short* bounds_vy_lower1(size_t N) { return (short*)((int8_t*)this+6+22*N); } + __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); } + + __forceinline short* bounds_vy_upper1(size_t N) { return (short*)((int8_t*)this+6+24*N); } + __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); } + + __forceinline int8_t* bounds_vz_x(size_t N) { return (int8_t*)((int8_t*)this+6+26*N); } + __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); } + + __forceinline int8_t* bounds_vz_y(size_t N) { return (int8_t*)((int8_t*)this+6+27*N); } + __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); } + + __forceinline int8_t* bounds_vz_z(size_t N) { return (int8_t*)((int8_t*)this+6+28*N); } + __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); } + + __forceinline short* bounds_vz_lower0(size_t N) { return (short*)((int8_t*)this+6+29*N); } + __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); } + + __forceinline short* bounds_vz_upper0(size_t N) { return (short*)((int8_t*)this+6+31*N); } + __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); } + + __forceinline short* bounds_vz_lower1(size_t N) { return (short*)((int8_t*)this+6+33*N); } + __forceinline const short* bounds_vz_lower1(size_t N) const { return 
(short*)((int8_t*)this+6+33*N); } + + __forceinline short* bounds_vz_upper1(size_t N) { return (short*)((int8_t*)this+6+35*N); } + __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); } + + __forceinline Vec3f* offset(size_t N) { return (Vec3f*)((int8_t*)this+6+37*N); } + __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); } + + __forceinline float* scale(size_t N) { return (float*)((int8_t*)this+6+37*N+12); } + __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); } + + __forceinline float& time_offset(size_t N) { return *(float*)((int8_t*)this+6+37*N+16); } + __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); } + + __forceinline float& time_scale(size_t N) { return *(float*)((int8_t*)this+6+37*N+20); } + __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); } + + __forceinline int8_t* end(size_t N) { return (int8_t*)this+6+37*N+24; } + __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; } + }; + + template + typename CurveNiMB::Type CurveNiMB::type; + + typedef CurveNiMB<4> Curve4iMB; + typedef CurveNiMB<8> Curve8iMB; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h new file mode 100644 index 00000000000..0cbc7646683 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h @@ -0,0 +1,516 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi_mb.h" +#include "../subdiv/linear_bezier_patch.h" + +namespace embree +{ + namespace isa + { + template + struct CurveNiMBIntersector1 + { + typedef CurveNiMB Primitive; + typedef Vec3vf Vec3vfM; + typedef LinearSpace3LinearSpace3vfM; + typedef CurvePrecalculations1 
Precalculations; + + static __forceinline vbool intersect(Ray& ray, const Primitive& prim, vfloat& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa org1 = (ray.org-offset)*scale; + const Vec3fa dir1 = ray.dir*scale; + + const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), + vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), + vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N); + const vfloat vx_lower0 = vfloat::load(prim.bounds_vx_lower0(N)); + const vfloat vx_lower1 = vfloat::load(prim.bounds_vx_lower1(N)); + const vfloat vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat vx_upper0 = vfloat::load(prim.bounds_vx_upper0(N)); + const vfloat vx_upper1 = vfloat::load(prim.bounds_vx_upper1(N)); + const vfloat vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat vy_lower0 = vfloat::load(prim.bounds_vy_lower0(N)); + const vfloat vy_lower1 = vfloat::load(prim.bounds_vy_lower1(N)); + const vfloat vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat vy_upper0 = vfloat::load(prim.bounds_vy_upper0(N)); + const vfloat vy_upper1 = vfloat::load(prim.bounds_vy_upper1(N)); + const vfloat vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat vz_lower0 = vfloat::load(prim.bounds_vz_lower0(N)); + const vfloat vz_lower1 = vfloat::load(prim.bounds_vz_lower1(N)); + const vfloat vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + 
const vfloat vz_upper0 = vfloat::load(prim.bounds_vz_upper0(N)); + const vfloat vz_upper1 = vfloat::load(prim.bounds_vz_upper1(N)); + const vfloat vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat t_lower_x = (vx_lower-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_upper_x = (vx_upper-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_lower_y = (vy_lower-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_upper_y = (vy_upper-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_lower_z = (vz_lower-vfloat(org2.z))*vfloat(rcp_dir2.z); + const vfloat t_upper_z = (vz_upper-vfloat(org2.z))*vfloat(rcp_dir2.z); + + const vfloat round_up (1.0f+3.0f*float(ulp)); + const vfloat round_down(1.0f-3.0f*float(ulp)); + const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear())); + const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar)); + tNear_o = tNear; + return (vint(step) < vint(prim.N)) & (tNear <= tFar); + } + + template + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + 
vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray.org, 
primID,ray.time()); + + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + + template + static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = 
movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray.org, primID,ray.time()); + Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray.org, primID,ray.time()); + if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + }; + + template + struct CurveNiMBIntersectorK + { + typedef CurveNiMB Primitive; + typedef Vec3vf Vec3vfM; + typedef LinearSpace3LinearSpace3vfM; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline vbool intersect(RayK& ray, const size_t k, const Primitive& prim, vfloat& tNear_o) + { + const size_t N = prim.N; + const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); + const Vec3fa offset = Vec3fa(offset_scale); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + const Vec3fa 
org1 = (ray_org-offset)*scale; + const Vec3fa dir1 = ray_dir*scale; + + const LinearSpace3vfM space(vfloat::load(prim.bounds_vx_x(N)), vfloat::load(prim.bounds_vx_y(N)), vfloat::load(prim.bounds_vx_z(N)), + vfloat::load(prim.bounds_vy_x(N)), vfloat::load(prim.bounds_vy_y(N)), vfloat::load(prim.bounds_vy_z(N)), + vfloat::load(prim.bounds_vz_x(N)), vfloat::load(prim.bounds_vz_y(N)), vfloat::load(prim.bounds_vz_z(N))); + + const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); + const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); + const Vec3vfM rcp_dir2 = rcp_safe(dir2); + + const vfloat ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N); + const vfloat vx_lower0 = vfloat::load(prim.bounds_vx_lower0(N)); + const vfloat vx_lower1 = vfloat::load(prim.bounds_vx_lower1(N)); + const vfloat vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0); + const vfloat vx_upper0 = vfloat::load(prim.bounds_vx_upper0(N)); + const vfloat vx_upper1 = vfloat::load(prim.bounds_vx_upper1(N)); + const vfloat vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0); + + const vfloat vy_lower0 = vfloat::load(prim.bounds_vy_lower0(N)); + const vfloat vy_lower1 = vfloat::load(prim.bounds_vy_lower1(N)); + const vfloat vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0); + const vfloat vy_upper0 = vfloat::load(prim.bounds_vy_upper0(N)); + const vfloat vy_upper1 = vfloat::load(prim.bounds_vy_upper1(N)); + const vfloat vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0); + + const vfloat vz_lower0 = vfloat::load(prim.bounds_vz_lower0(N)); + const vfloat vz_lower1 = vfloat::load(prim.bounds_vz_lower1(N)); + const vfloat vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0); + const vfloat vz_upper0 = vfloat::load(prim.bounds_vz_upper0(N)); + const vfloat vz_upper1 = vfloat::load(prim.bounds_vz_upper1(N)); + const vfloat vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0); + + const vfloat t_lower_x = (vx_lower-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_upper_x = 
(vx_upper-vfloat(org2.x))*vfloat(rcp_dir2.x); + const vfloat t_lower_y = (vy_lower-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_upper_y = (vy_upper-vfloat(org2.y))*vfloat(rcp_dir2.y); + const vfloat t_lower_z = (vz_lower-vfloat(org2.z))*vfloat(rcp_dir2.z); + const vfloat t_upper_z = (vz_upper-vfloat(org2.z))*vfloat(rcp_dir2.z); + + const vfloat round_up (1.0f+3.0f*float(ulp)); + const vfloat round_down(1.0f-3.0f*float(ulp)); + const vfloat tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat(ray.tnear()[k])); + const vfloat tFar = round_up *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat(ray.tfar[k])); + tNear_o = tNear; + return (vint(step) < vint(prim.N)) & (tNear <= tFar); + } + + template + static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID 
= prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]); + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_n(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_n(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve(context, ray_org, primID,ray.time()[k]); + + if 
(Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_h(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_h(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + + template + static __forceinline void intersect_hn(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + 
+ const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray_org, primID,ray.time()[k]); + Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_hn(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = context->scene->get(geomID); + const Vec3fa ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve(context, ray_org, primID,ray.time()[k]); + if (Intersector().intersect(pre,ray,k,context,geom,primID,curve,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h new file mode 100644 index 00000000000..6eb5e30b39a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h @@ -0,0 +1,101 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNi.h" + +namespace embree +{ + template + struct 
CurveNv : public CurveNi + { + using CurveNi::N; + + struct Type : public PrimitiveType { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; } + + static __forceinline size_t bytes(size_t N) + { + const size_t f = N/M, r = N%M; + static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue"); + return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r); + } + + public: + + /*! Default constructor. */ + __forceinline CurveNv () {} + + /*! fill curve from curve list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene) + { + size_t end = min(begin+M,_end); + size_t N = end-begin; + + /* encode all primitives */ + for (size_t i=0; iget(geomID); + const unsigned vtxID = mesh->curve(primID); + Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0)); + Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1)); + Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2)); + Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3)); + } + } + + template + __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) + { + if (set.size() == 0) + return BVH::emptyNode; + + /* fall back to CurveNi for oriented curves */ + unsigned int geomID = prims[set.begin()].geomID(); + if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) { + return CurveNi::createLeaf(bvh,prims,set,alloc); + } + if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) { + return CurveNi::createLeaf(bvh,prims,set,alloc); + } + + size_t 
start = set.begin(); + size_t items = CurveNv::blocks(set.size()); + size_t numbytes = CurveNv::bytes(set.size()); + CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment); + for (size_t i=0; i::fill(prims,start,set.end(),bvh->scene); + accel[i].CurveNi::fill(prims,start,set.end(),bvh->scene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + public: + unsigned char data[4*16*M]; + __forceinline Vec3fa* vertices(size_t i, size_t N) { return (Vec3fa*)CurveNi::end(N)+4*i; } + __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi::end(N)+4*i; } + }; + + template + typename CurveNv::Type CurveNv::type; + + typedef CurveNv<4> Curve4v; + typedef CurveNv<8> Curve8v; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h new file mode 100644 index 00000000000..e20da2882ef --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curveNv.h" +#include "curveNi_intersector.h" + +namespace embree +{ + namespace isa + { + template + struct CurveNvIntersector1 : public CurveNiIntersector1 + { + typedef CurveNv Primitive; + typedef CurvePrecalculations1 Precalculations; + + template + static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = CurveNiIntersector1::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff 
a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + } + + template + static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = CurveNiIntersector1::intersect(ray,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar)); + } + return false; + } + }; + + template + struct CurveNvIntersectorK : public CurveNiIntersectorK + { + typedef CurveNv Primitive; + typedef 
CurvePrecalculationsK Precalculations; + + template + static __forceinline void intersect_t(Precalculations& pre, RayHitK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = CurveNiIntersectorK::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(normal.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)); + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + } + + template + static __forceinline bool occluded_t(Precalculations& pre, RayK& ray, const size_t k, IntersectContext* context, const Primitive& prim) + { + vfloat tNear; + vbool valid = CurveNiIntersectorK::intersect(ray,k,prim,tNear); + + const size_t N = prim.N; + size_t mask = movemask(valid); + while (mask) + { + const size_t i = bscf(mask); + STAT3(shadow.trav_prims,1,1,1); + const unsigned int geomID = prim.geomID(N); + const unsigned int primID = prim.primID(N)[i]; + const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID); + const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]); + const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]); + const Vec3ff a2 = 
Vec3ff::loadu(&prim.vertices(i,N)[2]); + const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]); + + size_t mask1 = mask; + const size_t i1 = bscf(mask1); + if (mask) { + prefetchL1(&prim.vertices(i1,N)[0]); + prefetchL1(&prim.vertices(i1,N)[4]); + if (mask1) { + const size_t i2 = bsf(mask1); + prefetchL2(&prim.vertices(i2,N)[0]); + prefetchL2(&prim.vertices(i2,N)[4]); + } + } + + if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID))) + return true; + + mask &= movemask(tNear <= vfloat(ray.tfar[k])); + } + return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h new file mode 100644 index 00000000000..204958f7cc3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h @@ -0,0 +1,98 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "bezier_hair_intersector.h" +#include "bezier_ribbon_intersector.h" +#include "bezier_curve_intersector.h" +#include "oriented_curve_intersector.h" +#include "../bvh/node_intersector1.h" + +// FIXME: this file seems replicate of curve_intersector_virtual.h + +namespace embree +{ + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + 
leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect(&pre,&ray,bscf(mask),context,prim); + } + + static __forceinline vbool occluded(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + vbool valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& 
ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect(&pre,&ray,k,context,prim); + } + + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurvePrimitive::Intersectors& leafIntersector = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded(&pre,&ray,k,context,prim); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h new file mode 100644 index 00000000000..343cc8ff28f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h @@ -0,0 +1,129 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template + struct DistanceCurveHit + { + __forceinline DistanceCurveHit() {} + + __forceinline DistanceCurveHit(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const int i, const int N, + const NativeCurve3fa& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat(step)+U+vfloat(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + 
__forceinline Vec3fa Ng(const size_t i) const { + return curve3D.eval_du(vu[i]); + } + + public: + vfloat U; + vfloat V; + vfloat T; + int i, N; + NativeCurve3fa curve3D; + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + }; + + template + struct DistanceCurve1Intersector1 + { + template + __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + + /* transform control points into ray space */ + const NativeCurve3fa curve3Di(v0,v1,v2,v3); + const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di); + const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org); + + /* evaluate the bezier curve */ + vboolx valid = vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0(0,N); + const Vec4vfx p1 = curve2D.template eval1(0,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + bool ishit = false; + if (unlikely(any(valid))) { + DistanceCurveHit hit(valid,u,0.0f,t,0,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration 
*/ + for (int i=VSIZEX; i(i,N); + const Vec4vfx p1 = curve2D.template eval1(i,N); + + /* approximative intersection with cone */ + const Vec4vfx v = p1-p0; + const Vec4vfx w = -p0; + const vfloatx d0 = madd(w.x,v.x,w.y*v.y); + const vfloatx d1 = madd(v.x,v.x,v.y*v.y); + const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one)); + const Vec4vfx p = madd(u,v,p0); + const vfloatx t = p.z*pre.depth_scale; + const vfloatx d2 = madd(p.x,p.x,p.y*p.y); + const vfloatx r = p.w; + const vfloatx r2 = r*r; + valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections + + /* update hit information */ + if (unlikely(any(valid))) { + DistanceCurveHit hit(valid,u,0.0f,t,i,N,curve3D); + ishit = ishit | epilog(valid,hit); + } + } + } + return ishit; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h new file mode 100644 index 00000000000..47531027fc3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h @@ -0,0 +1,417 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" +#include "curve_intersector_sweep.h" +#include "../subdiv/linear_bezier_patch.h" + +#define DBG(x) + +namespace embree +{ + namespace isa + { + template + struct TensorLinearCubicBezierSurfaceIntersector + { + const LinearSpace3fa& ray_space; + Ray& ray; + TensorLinearCubicBezierSurface3fa curve3d; + TensorLinearCubicBezierSurface2fa curve2d; + float eps; + const Epilog& epilog; + bool isHit; + + __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const 
TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog) + : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false) + { + const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); + curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); + const BBox2fa b2 = curve2d.bounds(); + eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); + } + + __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1) + { + if (p1 == p0) { + if (p0 == 0.0f) return Interval1f(u0,u1); + else return Interval1f(empty); + } + const float t = -p0/(p1-p0); + const float tt = lerp(u0,u1,t); + return Interval1f(tt); + } + + __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u) + { + if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); + if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); + if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); + if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); + } + + __forceinline Interval1f bezier_clipping(const CubicBezierCurve& curve) + { + Interval1f u = empty; + solve_linear(0.0f/3.0f,1.0f/3.0f,curve.v0,curve.v1,u); + solve_linear(0.0f/3.0f,2.0f/3.0f,curve.v0,curve.v2,u); + solve_linear(0.0f/3.0f,3.0f/3.0f,curve.v0,curve.v3,u); + solve_linear(1.0f/3.0f,2.0f/3.0f,curve.v1,curve.v2,u); + solve_linear(1.0f/3.0f,3.0f/3.0f,curve.v1,curve.v3,u); + solve_linear(2.0f/3.0f,3.0f/3.0f,curve.v2,curve.v3,u); + return intersect(u,Interval1f(0.0f,1.0f)); + } + + __forceinline Interval1f bezier_clipping(const LinearBezierCurve& curve) + { + Interval1f v = empty; + solve_linear(0.0f,1.0f,curve.v0,curve.v1,v); + return intersect(v,Interval1f(0.0f,1.0f)); + } + + __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& 
curve2) + { + BBox2fa bounds = curve2.bounds(); + if (bounds.upper.x < 0.0f) return; + if (bounds.upper.y < 0.0f) return; + if (bounds.lower.x > 0.0f) return; + if (bounds.lower.y > 0.0f) return; + + if (max(cu.size(),cv.size()) < 1E-4f) + { + const float u = cu.center(); + const float v = cv.center(); + TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (ray.tnear() <= t && t <= ray.tfar) { + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + } + return; + } + + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return; + + const Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return; + TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + const Vec2fa du = curve2.axis_u(); + const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du); + CubicBezierCurve curve0u = curve1u.reduce_v(); + int roots = curve0u.maxRoots(); + if (roots == 0) return; + + if (roots == 1) + { + const Interval1f u = bezier_clipping(curve0u); + if (isEmpty(u)) return; + TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u); + cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper)); + solve_bezier_clipping(cu,cv,curve2b); + return; + } + + TensorLinearCubicBezierSurface2fa curve2l, curve2r; + curve2a.split_u(curve2l,curve2r); + solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l); + solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r); + } + + __forceinline bool solve_bezier_clipping() + { + solve_bezier_clipping(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f),curve2d); + return isHit; + } + + __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv) + { + Vec2fa 
uv(cu.center(),cv.center()); + const Vec2fa dfdu = curve2d.eval_du(uv.x,uv.y); + const Vec2fa dfdv = curve2d.eval_dv(uv.x,uv.y); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + solve_newton_raphson_loop(cu,cv,uv,dfdu,dfdv,rcp_J); + } + + __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J) + { + Vec2fa uv = uv_in; + + for (size_t i=0; i<200; i++) + { + const Vec2fa f = curve2d.eval(uv.x,uv.y); + const Vec2fa duv = rcp_J*f; + uv -= duv; + + if (max(abs(f.x),abs(f.y)) < eps) + { + const float u = uv.x; + const float v = uv.y; + if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs + if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs + const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); + const float t = curve_z.eval(u,v); + if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs + const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); + BezierCurveHit hit(t,u,v,Ng); + isHit |= epilog(hit); + return; + } + } + } + + __forceinline bool clip_v(BBox1f& cu, BBox1f& cv) + { + const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); + const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv); + LinearBezierCurve curve0v = curve1v.reduce_u(); + if (!curve0v.hasRoot()) return false; + Interval1f v = bezier_clipping(curve0v); + if (isEmpty(v)) return false; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + return true; + } + + __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv) + { + /* perform bezier clipping in v-direction to get tight v-bounds */ + TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv); + const Vec2fa dv = curve2.axis_v(); + const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv); + LinearBezierCurve curve0v = 
curve1v.reduce_u(); + if (unlikely(!curve0v.hasRoot())) return true; + Interval1f v = bezier_clipping(curve0v); + if (unlikely(isEmpty(v))) return true; + v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); + curve2 = curve2.clip_v(v); + cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); + + /* perform one newton raphson iteration */ + Vec2fa c(cu.center(),cv.center()); + Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); + const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv)); + const Vec2fa c1 = c - rcp_J*f; + + /* calculate bounds of derivatives */ + const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); + const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds(); + + /* calculate krawczyk test */ + LinearSpace2> I(Interval1f(1.0f), Interval1f(0.0f), + Interval1f(0.0f), Interval1f(1.0f)); + + LinearSpace2> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x), + Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y)); + + const LinearSpace2 rcp_J2(rcp_J); + const LinearSpace2> rcp_Ji(rcp_J2); + + const Vec2 x(cu,cv); + const Vec2 K = Vec2(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2(Vec2f(c))); + + /* test if there is no solution */ + const Vec2 KK = intersect(K,x); + if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true; + + /* exit if convergence cannot get proven, but terminate if we are very small */ + if (unlikely(!subset(K,x) && !very_small)) return false; + + /* solve using newton raphson iteration of convergence is guarenteed */ + solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); + return true; + } + + __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv) + { + if (!clip_v(cu,cv)) return; + return solve_newton_raphson(cu,cv); + } + + __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv) + { + unsigned int sptr = 0; + const unsigned int stack_size 
= 4; + unsigned int mask_stack[stack_size]; + BBox1f cu_stack[stack_size]; + BBox1f cv_stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + size_t mask = mask_stack[sptr]; + cu = cu_stack[sptr]; + cv = cv_stack[sptr]; + const size_t i = bscf(mask); + mask_stack[sptr] = mask; + if (mask) sptr++; // there are still items on the stack + + /* process next element recurse into each hit curve segment */ + const float u0 = float(i+0)*(1.0f/(VSIZEX-1)); + const float u1 = float(i+1)*(1.0f/(VSIZEX-1)); + const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1)); + cu = cui; + } + +#if 0 + solve_newton_raphson_no_recursion(cu,cv); + continue; + +#else + /* we assume convergence for small u ranges and verify using krawczyk */ + if (cu.size() < 1.0f/6.0f) { + const bool very_small = cu.size() < 0.001f || sptr >= stack_size; + if (solve_krawczyk(very_small,cu,cv)) { + continue; + } + } +#endif + + entry: + + /* split the curve into VSIZEX-1 segments in u-direction */ + vboolx valid = true; + TensorLinearCubicBezierSurface subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu); + + /* slabs test in u-direction */ + Vec2vfx ndv = cross(subcurves.axis_v()); + BBox boundsv = subcurves.vxfm(ndv).bounds(); + valid &= boundsv.lower <= eps; + valid &= boundsv.upper >= -eps; + if (none(valid)) continue; + + /* slabs test in v-direction */ + Vec2vfx ndu = cross(subcurves.axis_u()); + BBox boundsu = subcurves.vxfm(ndu).bounds(); + valid &= boundsu.lower <= eps; + valid &= boundsu.upper >= -eps; + if (none(valid)) continue; + + /* push valid segments to stack */ + assert(sptr < stack_size); + mask_stack [sptr] = movemask(valid); + cu_stack [sptr] = cu; + cv_stack [sptr] = cv; + sptr++; + } + } + + __forceinline bool solve_newton_raphson_main() + { + BBox1f vu(0.0f,1.0f); + BBox1f vv(0.0f,1.0f); + solve_newton_raphson_recursion(vu,vv); + return isHit; + } + }; + + + template class SourceCurve> + struct 
OrientedCurve1Intersector1 + { + //template using Curve = SourceCurve; + typedef SourceCurve SourceCurve3ff; + typedef SourceCurve SourceCurve3fa; + + __forceinline OrientedCurve1Intersector1() {} + + __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {} + + template + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + + template + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const + { + STAT3(normal.trav_prims,1,1,1); + //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main(); + } + }; + + template class SourceCurve, int K> + struct OrientedCurve1IntersectorK + { + //template using Curve = SourceCurve; + typedef SourceCurve SourceCurve3ff; + typedef SourceCurve SourceCurve3fa; + + struct Ray1 + { + __forceinline Ray1(RayK& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), 
dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + }; + + template + __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i, + const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + SourceCurve3ff ccurve(v0i,v1i,v2i,v3i); + SourceCurve3fa ncurve(n0i,n1i,n2i,n3i); + ccurve = enlargeRadiusToMinWidth(context,geom,ray.org,ccurve); + TensorLinearCubicBezierSurface3fa curve = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve); + //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + + template + __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const TensorLinearCubicBezierSurface3fa& curve, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + //return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_bezier_clipping(); + return TensorLinearCubicBezierSurfaceIntersector(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main(); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h 
b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h new file mode 100644 index 00000000000..6e9fc919257 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h @@ -0,0 +1,49 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/geometry.h" + +namespace embree +{ + namespace isa + { + struct CurvePrecalculations1 + { + float depth_scale; + LinearSpace3fa ray_space; + + __forceinline CurvePrecalculations1() {} + + __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr) + { + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + LinearSpace3fa space = frame(depth_scale*ray.dir); + space.vz *= depth_scale; + ray_space = space.transposed(); + } + }; + + template + struct CurvePrecalculationsK + { + vfloat depth_scale; + LinearSpace3fa ray_space[K]; + + __forceinline CurvePrecalculationsK(const vbool& valid, const RayK& ray) + { + size_t mask = movemask(valid); + depth_scale = rsqrt(dot(ray.dir,ray.dir)); + while (mask) { + size_t k = bscf(mask); + Vec3fa ray_dir_k = Vec3fa(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + LinearSpace3fa ray_space_k = frame(depth_scale[k]*ray_dir_k); + ray_space_k.vz *= depth_scale[k]; + ray_space[k] = ray_space_k.transposed(); + } + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h new file mode 100644 index 00000000000..a99cf99d56c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h @@ -0,0 +1,214 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "quad_intersector.h" +#include "curve_intersector_precalculations.h" + +#define Bezier1Intersector1 RibbonCurve1Intersector1 +#define Bezier1IntersectorK RibbonCurve1IntersectorK + 
+namespace embree +{ + namespace isa + { + template + struct RibbonHit + { + __forceinline RibbonHit() {} + + __forceinline RibbonHit(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const int i, const int N, + const NativeCurve3ff& curve3D) + : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {} + + __forceinline void finalize() + { + vu = (vfloat(step)+U+vfloat(float(i)))*(1.0f/float(N)); + vv = V; + vt = T; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { + return curve3D.eval_du(vu[i]); + } + + public: + vfloat U; + vfloat V; + vfloat T; + int i, N; + NativeCurve3ff curve3D; + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + }; + + /* calculate squared distance of point p0 to line p1->p2 */ + __forceinline std::pair sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2) + { + const vfloatx num = det(p2-p1,p1-p0); + const vfloatx den2 = dot(p2-p1,p2-p1); + return std::make_pair(num*num,den2); + } + + /* performs culling against a cylinder */ + __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r) + { + const std::pair d = sqr_point_line_distance(p0,p1,p2); + return d.first <= r*r*d.second; + } + + template + __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar, + const LinearSpace3fa& ray_space, const float& depth_scale, + const NativeCurve3ff& curve3D, const int N, + const Epilog& epilog) + { + /* transform control points into ray space */ + const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org); + float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); + + /* evaluate the bezier curve */ + bool ishit = false; + vboolx valid = 
vfloatx(step) < vfloatx(float(N)); + const Vec4vfx p0 = curve2D.template eval0(0,N); + const Vec4vfx p1 = curve2D.template eval1(0,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + + if (any(valid)) + { + Vec3vfx dp0dt = curve2D.template derivative0(0,N); + Vec3vfx dp1dt = curve2D.template derivative1(0,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit bhit(valid0,vu,vv,vt,0,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + + if (unlikely(VSIZEX < N)) + { + /* process SIMD-size many segments per iteration */ + for (int i=VSIZEX; i(i,N); + const Vec4vfx p1 = curve2D.template eval1(i,N); + valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); + if (none(valid)) continue; + + Vec3vfx dp0dt = curve2D.template derivative0(i,N); + Vec3vfx dp1dt = curve2D.template derivative1(i,N); + dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); + dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt); + const Vec3vfx 
n0(dp0dt.y,-dp0dt.x,0.0f); + const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f); + const Vec3vfx nn0 = normalize(n0); + const Vec3vfx nn1 = normalize(n1); + const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1)); + const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0)); + const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1)); + + vfloatx vu,vv,vt; + vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt); + + if (any(valid0)) + { + /* ignore self intersections */ + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) { + vfloatx r = lerp(p0.w, p1.w, vu); + valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; + } + + if (any(valid0)) + { + vv = madd(2.0f,vv,vfloatx(-1.0f)); + RibbonHit bhit(valid0,vu,vv,vt,i,N,curve3D); + ishit |= epilog(bhit.valid,bhit); + } + } + } + } + return ishit; + } + + template class NativeCurve> + struct RibbonCurve1Intersector1 + { + typedef NativeCurve NativeCurve3ff; + + template + __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = geom->tessellationRate; + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray.org,curve); + return intersect_ribbon(ray.org,ray.dir,ray.tnear(),ray.tfar, + pre.ray_space,pre.depth_scale, + curve,N, + epilog); + } + }; + + template class NativeCurve, int K> + struct RibbonCurve1IntersectorK + { + typedef NativeCurve NativeCurve3ff; + + template + __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& ray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + const int N = 
geom->tessellationRate; + const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + NativeCurve3ff curve(v0,v1,v2,v3); + curve = enlargeRadiusToMinWidth(context,geom,ray_org,curve); + return intersect_ribbon(ray_org,ray_dir,ray.tnear()[k],ray.tfar[k], + pre.ray_space[k],pre.depth_scale[k], + curve,N, + epilog); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h new file mode 100644 index 00000000000..883cedc3d2a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h @@ -0,0 +1,362 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "cylinder.h" +#include "plane.h" +#include "line_intersector.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + static const size_t numJacobianIterations = 5; +#if defined(__AVX__) + static const size_t numBezierSubdivisions = 2; +#else + static const size_t numBezierSubdivisions = 3; +#endif + + struct BezierCurveHit + { + __forceinline BezierCurveHit() {} + + __forceinline BezierCurveHit(const float t, const float u, const Vec3fa& Ng) + : t(t), u(u), v(0.0f), Ng(Ng) {} + + __forceinline BezierCurveHit(const float t, const float u, const float v, const Vec3fa& Ng) + : t(t), u(u), v(v), Ng(Ng) {} + + __forceinline void finalize() {} + + public: + float t; + float u; + float v; + Vec3fa Ng; + }; + + template + __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i, + const vfloatx& u, const BBox& tp, const BBox& h0, const BBox& h1, + const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du, + const Epilog& epilog) + { + if (tp.lower[i]+dt > ray.tfar) return false; + Vec3fa Ng_o = Vec3fa(Ng.x[i],Ng.y[i],Ng.z[i]); + if 
(h0.lower[i] == tp.lower[i]) Ng_o = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); + if (h1.lower[i] == tp.lower[i]) Ng_o = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); + BezierCurveHit hit(tp.lower[i]+dt,u[i],Ng_o); + return epilog(hit); + } + + template + __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog) + { + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + const float length_ray_dir = length(dir); + + /* error of curve evaluations is propertional to largest coordinate */ + const BBox3ff box = curve.bounds(); + const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); + + for (size_t i=0; i= 0.0f && u <= 1.0f)) return false; // rejects NaNs + const Vec3fa R = normalize(Q-P); + const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu); + const Vec3fa V = cross(dPdu,R); + BezierCurveHit hit(t,u,cross(V,U)); + return epilog(hit); + } + } + return false; + } + + template + bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, + float u0, float u1, unsigned int depth, const Epilog& epilog) + { +#if defined(__AVX__) + typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues + typedef vint8 vintx; + typedef vfloat8 vfloatx; +#else + typedef vbool4 vboolx; + typedef vint4 vintx; + typedef vfloat4 vfloatx; +#endif + typedef Vec3 Vec3vfx; + typedef Vec4 Vec4vfx; + + unsigned int maxDepth = numBezierSubdivisions; + bool found = false; + const Vec3fa org = zero; + const Vec3fa dir = ray.dir; + + unsigned int sptr = 0; + const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below + struct StackEntry { + vboolx valid; + vfloatx tlower; + float u0; + float u1; + unsigned int depth; + }; + StackEntry stack[stack_size]; + goto entry; + + /* terminate if stack is empty */ + while (sptr) + { + /* pop from stack */ + { + sptr--; + vboolx valid = stack[sptr].valid; 
+ const vfloatx tlower = stack[sptr].tlower; + valid &= tlower+dt <= ray.tfar; + if (none(valid)) continue; + u0 = stack[sptr].u0; + u1 = stack[sptr].u1; + depth = stack[sptr].depth; + const size_t i = select_min(valid,tlower); clear(valid,i); + stack[sptr].valid = valid; + if (any(valid)) sptr++; // there are still items on the stack + + /* process next segment */ + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + u0 = vu0[i+0]; + u1 = vu0[i+1]; + } + entry: + + /* subdivide curve */ + const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1))); + Vec4vfx P0, dP0du; curve.veval(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale); + const Vec4vfx P3 = shift_right_1(P0); + const Vec4vfx dP3du = shift_right_1(dP0du); + const Vec4vfx P1 = P0 + dP0du; + const Vec4vfx P2 = P3 - dP3du; + + /* calculate bounding cylinders */ + const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0)); + const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0)); + const vfloatx maxr12 = sqrt(max(rr1,rr2)); + const vfloatx one_plus_ulp = 1.0f+2.0f*float(ulp); + const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp); + vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12; + vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12; + r_outer = one_plus_ulp*r_outer; + r_inner = max(0.0f,one_minus_ulp*r_inner); + const CylinderN cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer); + const CylinderN cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner); + vboolx valid = true; clear(valid,vfloatx::size-1); + + /* intersect with outer cylinder */ + BBox tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1; + valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1); + if (none(valid)) continue; + + /* intersect with cap-planes */ + BBox tp(ray.tnear()-dt,ray.tfar-dt); + tp = embree::intersect(tp,tc_outer); + BBox h0 = 
HalfPlaneN(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir); + tp = embree::intersect(tp,h0); + BBox h1 = HalfPlaneN(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir); + tp = embree::intersect(tp,h1); + valid &= tp.lower <= tp.upper; + if (none(valid)) continue; + + /* clamp and correct u parameter */ + u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f)); + u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f)); + u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); + u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size))); + + /* intersect with inner cylinder */ + BBox tc_inner; + vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero; + const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1); + + /* at the unstable area we subdivide deeper */ + const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f); + const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f); + + /* subtract the inner interval from the current hit interval */ + BBox tp0, tp1; + subtract(tp,tc_inner,tp0,tp1); + vboolx valid0 = valid & (tp0.lower <= tp0.upper); + vboolx valid1 = valid & (tp1.lower <= tp1.upper); + if (none(valid0 | valid1)) continue; + + /* iterate over all first hits front to back */ + const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid0 = valid0 & (depth < termDepth0); + valid0 &= depth >= termDepth0; + + while (any(valid0)) + { + const size_t i = select_min(valid0,tp0.lower); clear(valid0,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog); + valid0 &= tp0.lower+dt <= ray.tfar; + } + valid1 &= tp1.lower+dt <= ray.tfar; + + /* iterate over all 
second hits front to back */ + const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth)); + vboolx recursion_valid1 = valid1 & (depth < termDepth1); + valid1 &= depth >= termDepth1; + while (any(valid1)) + { + const size_t i = select_min(valid1,tp1.lower); clear(valid1,i); + found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); + //found = found | intersect_bezier_iterative_debug (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog); + valid1 &= tp1.lower+dt <= ray.tfar; + } + + /* push valid segments to stack */ + recursion_valid0 &= tp0.lower+dt <= ray.tfar; + recursion_valid1 &= tp1.lower+dt <= ray.tfar; + const vboolx recursion_valid = recursion_valid0 | recursion_valid1; + if (any(recursion_valid)) + { + assert(sptr < stack_size); + stack[sptr].valid = recursion_valid; + stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); + stack[sptr].u0 = u0; + stack[sptr].u1 = u1; + stack[sptr].depth = depth+1; + sptr++; + } + } + return found; + } + + template class NativeCurve> + struct SweepCurve1Intersector1 + { + typedef NativeCurve NativeCurve3ff; + + template + __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + + template class NativeCurve, int K> + struct SweepCurve1IntersectorK + { + typedef NativeCurve 
NativeCurve3ff; + + struct Ray1 + { + __forceinline Ray1(RayK& ray, size_t k) + : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]), dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]), _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {} + + Vec3fa org; + Vec3fa dir; + float _tnear; + float& tfar; + + __forceinline float& tnear() { return _tnear; } + //__forceinline float& tfar() { return _tfar; } + __forceinline const float& tnear() const { return _tnear; } + //__forceinline const float& tfar() const { return _tfar; } + + }; + + template + __forceinline bool intersect(const CurvePrecalculationsK& pre, RayK& vray, size_t k, + IntersectContext* context, + const CurveGeometry* geom, const unsigned int primID, + const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, + const Epilog& epilog) + { + STAT3(normal.trav_prims,1,1,1); + Ray1 ray(vray,k); + + /* move ray closer to make intersection stable */ + NativeCurve3ff curve0(v0,v1,v2,v3); + curve0 = enlargeRadiusToMinWidth(context,geom,ray.org,curve0); + const float dt = dot(curve0.center()-ray.org,ray.dir)*rcp(dot(ray.dir,ray.dir)); + const Vec3ff ref(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); + const NativeCurve3ff curve1 = curve0-ref; + return intersect_bezier_recursive_jacobian(ray,dt,curve1,0.0f,1.0f,1,epilog); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h new file mode 100644 index 00000000000..e1f42381306 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h @@ -0,0 +1,671 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../subdiv/bezier_curve.h" +#include "../common/primref.h" +#include "curve_intersector_precalculations.h" +#include "../bvh/node_intersector1.h" +#include "../bvh/node_intersector_packet.h" + +#include "intersector_epilog.h" + +#include 
"../subdiv/bezier_curve.h" +#include "../subdiv/bspline_curve.h" +#include "../subdiv/hermite_curve.h" +#include "../subdiv/catmullrom_curve.h" + +#include "spherei_intersector.h" +#include "disci_intersector.h" + +#include "linei_intersector.h" +#include "roundlinei_intersector.h" +#include "conelinei_intersector.h" + +#include "curveNi_intersector.h" +#include "curveNv_intersector.h" +#include "curveNi_mb_intersector.h" + +#include "curve_intersector_distance.h" +#include "curve_intersector_ribbon.h" +#include "curve_intersector_oriented.h" +#include "curve_intersector_sweep.h" + +namespace embree +{ + struct VirtualCurveIntersector + { + typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive); + typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive); + + typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect8Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded8Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + typedef void (*Intersect16Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + typedef bool (*Occluded16Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + struct Intersectors + { + Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp. 
+ + template void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive); + template bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive); + + template void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + template bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive); + + public: + Intersect1Ty intersect1; + Occluded1Ty occluded1; + Intersect4Ty intersect4; + Occluded4Ty occluded4; + Intersect8Ty intersect8; + Occluded8Ty occluded8; + Intersect16Ty intersect16; + Occluded16Ty occluded16; + }; + + Intersectors vtbl[Geometry::GTY_END]; + }; + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); intersect1(pre,ray,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return occluded1(pre,ray,context,primitive); } + + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); intersect4(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return occluded4(pre,ray,k,context,primitive); } + +#if defined(__AVX__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); intersect8(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* 
context, const void* primitive) { assert(occluded8); return occluded8(pre,ray,k,context,primitive); } +#endif + +#if defined(__AVX512F__) + template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); intersect16(pre,ray,k,context,primitive); } + template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return occluded16(pre,ray,k,context,primitive); } +#endif + + namespace isa + { + struct VirtualCurveIntersector1 + { + typedef unsigned char Primitive; + typedef CurvePrecalculations1 Precalculations; + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect<1>(&pre,&ray,context,prim); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded<1>(&pre,&ray,context,prim); + } + }; + + template + struct VirtualCurveIntersectorK + { + typedef unsigned char Primitive; + typedef CurvePrecalculationsK Precalculations; + + template + static __forceinline void intersect(const vbool& valid_i, const 
Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + size_t mask = movemask(valid_i); + while (mask) leafIntersector.intersect(&pre,&ray,bscf(mask),context,prim); + } + + template + static __forceinline vbool occluded(const vbool& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + vbool valid_o = false; + size_t mask = movemask(valid_i); + while (mask) { + size_t k = bscf(mask); + if (leafIntersector.occluded(&pre,&ray,k,context,prim)) + set(valid_o, k); + } + return valid_o; + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = (RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + leafIntersector.intersect(&pre,&ray,k,context,prim); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + assert(num == 1); + RTCGeometryType ty = 
(RTCGeometryType)(*prim); + assert(This->leafIntersector); + VirtualCurveIntersector::Intersectors& leafIntersector = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[ty]; + return leafIntersector.occluded(&pre,&ray,k,context,prim); + } + }; + + template + static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = 
(VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &RoundLinearCurveMiMBIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &RoundLinearCurveMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&RoundLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &RoundLinearCurveMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&RoundLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &RoundLinearCurveMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &ConeCurveMiMBIntersector1::occluded; + intersectors.intersect4 = 
(VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &ConeCurveMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&ConeCurveMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &ConeCurveMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&ConeCurveMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &ConeCurveMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + + template + static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = 
(VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &FlatLinearCurveMiMBIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &FlatLinearCurveMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&FlatLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &FlatLinearCurveMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&FlatLinearCurveMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &FlatLinearCurveMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors SphereNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK::occluded; +#endif + return intersectors; + } + + 
template + static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&SphereMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&SphereMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors DiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiIntersectorK::intersect; + intersectors.occluded16 = 
(VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&DiscMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&DiscMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + 
intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK::occluded; +#endif + return intersectors; + } + + template + static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1::intersect; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1::occluded; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK::intersect; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK::occluded; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&OrientedDiscMiMBIntersectorK::intersect; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK::occluded; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&OrientedDiscMiMBIntersectorK::intersect; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK::occluded; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_t, Intersect1EpilogMU >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_t , Occluded1EpilogMU >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) 
&CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1::template intersect_t, Intersect1EpilogMU >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1::template occluded_t , Occluded1EpilogMU >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif + return 
intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_t, Intersect1EpilogMU >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_t , Occluded1EpilogMU >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilogMU >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilogMU >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_t, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_t , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) 
&CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNvIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1::template intersect_t, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1::template occluded_t , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNvIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) 
&CurveNvIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_t, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_t , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_t, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_t , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_n, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_n , Occluded1Epilog1 >; + intersectors.intersect4 = 
(VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_n, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_n , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_n, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_n , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = 
(VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_n, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_n , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_h, Intersect1EpilogMU >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_h , Occluded1EpilogMU >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_h, Intersect1EpilogMU >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) 
&CurveNiMBIntersector1::template occluded_h , Occluded1EpilogMU >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilogMU >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilogMU >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1::template intersect_h, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_h , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<8,true> >; +#endif +#if 
defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_h, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_h , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_h, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_h , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_h, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_h , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) 
&CurveNiIntersector1::template intersect_hn, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1::template occluded_hn , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiIntersectorK::template intersect_hn, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK::template occluded_hn , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + + template class Curve, int N> + static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors() + { + VirtualCurveIntersector::Intersectors intersectors; + intersectors.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1::template intersect_hn, Intersect1Epilog1 >; + intersectors.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1::template occluded_hn , Occluded1Epilog1 >; + intersectors.intersect4 = (VirtualCurveIntersector::Intersect4Ty)&CurveNiMBIntersectorK::template intersect_hn, Intersect1KEpilog1<4,true> >; + intersectors.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<4,true> >; +#if defined(__AVX__) + intersectors.intersect8 = (VirtualCurveIntersector::Intersect8Ty)&CurveNiMBIntersectorK::template 
intersect_hn, Intersect1KEpilog1<8,true> >; + intersectors.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<8,true> >; +#endif +#if defined(__AVX512F__) + intersectors.intersect16 = (VirtualCurveIntersector::Intersect16Ty)&CurveNiMBIntersectorK::template intersect_hn, Intersect1KEpilog1<16,true> >; + intersectors.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK::template occluded_hn , Occluded1KEpilog1<16,true> >; +#endif + return intersectors; + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h new file mode 100644 index 00000000000..69cf612275f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h new file mode 100644 index 00000000000..d37e41098ed --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light 
Transport Entertainment Inc. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h new file mode 100644 index 00000000000..a133a11d63e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h new file mode 100644 index 00000000000..9aec35da452 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h new file mode 100644 index 00000000000..dd37d194f5d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h @@ -0,0 +1,21 @@ +// Copyright 2020 Light Transport Entertainment Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim); +#if defined(__AVX__) + void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h new file mode 100644 index 00000000000..fe5ceed840a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h @@ -0,0 +1,22 @@ +// Copyright 2020 Light Transport Entertainment Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim); + void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim); + void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim); + +#if defined (__AVX__) + void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim); + void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim); + void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h new file mode 100644 index 00000000000..39a582864c5 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h @@ -0,0 +1,223 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct Cylinder + { + const Vec3fa p0; //!< start location + const Vec3fa p1; //!< end position + const float rr; //!< squared radius of cylinder + + __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + __forceinline bool intersect(const Vec3fa& org, + const Vec3fa& dir, + BBox1f& t_o, + float& u0_o, Vec3fa& Ng0_o, + float& u1_o, Vec3fa& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const float rl = rcp_length(p1-p0); + const Vec3fa P0 = p0, dP = (p1-p0)*rl; + const Vec3fa O = org-P0, dO = dir; + + const float dOdO = dot(dO,dO); + const float OdO = dot(dO,O); + const float OO = dot(O,O); + const float dOz = dot(dP,dO); + const float Oz = dot(dP,O); + + const float A = dOdO - sqr(dOz); + const float B = 2.0f * (OdO - dOz*Oz); + const float 
C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const float D = B*B - 4.0f*A*C; + if (D < 0.0f) { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + + /* special case for rays that are parallel to the cylinder */ + const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + if (abs(A) < eps) + { + if (C <= 0.0f) { + t_o = BBox1f(neg_inf,pos_inf); + return true; + } else { + t_o = BBox1f(pos_inf,neg_inf); + return false; + } + } + + /* standard case for rays that are not parallel to the cylinder */ + const float Q = sqrt(D); + const float rcp_2A = rcp(2.0f*A); + const float t0 = (-B-Q)*rcp_2A; + const float t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3fa Pr = t0*dir; + const Vec3fa Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3fa Pr = t1*dir; + const Vec3fa Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = t0; + t_o.upper = t1; + return true; + } + + __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const + { + float u0_o; Vec3fa Ng0_o; + float u1_o; Vec3fa Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + + static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1) + { + float eps = 0.001f; + BBox1f t; bool hit; + hit = cylinder.intersect(ray.org,ray.dir,t); + + bool failed = hit != shouldhit; + if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps; + if (shouldhit) failed |= std::isinf(t1) ? 
t1 != t.upper : abs(t1-t.upper) > eps; + if (!failed) return true; + embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; + return false; + } + + /* verify cylinder class */ + static bool verify() + { + bool passed = true; + const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f); + passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f); + passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f); + passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf); + passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf); + return passed; + } + + /*! 
output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) { + return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}"; + } + }; + + template + struct CylinderN + { + const Vec3vf p0; //!< start location + const Vec3vf p1; //!< end position + const vfloat rr; //!< squared radius of cylinder + + __forceinline CylinderN(const Vec3vf& p0, const Vec3vf& p1, const vfloat& r) + : p0(p0), p1(p1), rr(sqr(r)) {} + + __forceinline CylinderN(const Vec3vf& p0, const Vec3vf& p1, const vfloat& rr, bool) + : p0(p0), p1(p1), rr(rr) {} + + + __forceinline vbool intersect(const Vec3fa& org, const Vec3fa& dir, + BBox>& t_o, + vfloat& u0_o, Vec3vf& Ng0_o, + vfloat& u1_o, Vec3vf& Ng1_o) const + { + /* calculate quadratic equation to solve */ + const vfloat rl = rcp_length(p1-p0); + const Vec3vf P0 = p0, dP = (p1-p0)*rl; + const Vec3vf O = Vec3vf(org)-P0, dO = dir; + + const vfloat dOdO = dot(dO,dO); + const vfloat OdO = dot(dO,O); + const vfloat OO = dot(O,O); + const vfloat dOz = dot(dP,dO); + const vfloat Oz = dot(dP,O); + + const vfloat A = dOdO - sqr(dOz); + const vfloat B = 2.0f * (OdO - dOz*Oz); + const vfloat C = OO - sqr(Oz) - rr; + + /* we miss the cylinder if determinant is smaller than zero */ + const vfloat D = B*B - 4.0f*A*C; + vbool valid = D >= 0.0f; + if (none(valid)) { + t_o = BBox>(empty); + return valid; + } + + /* standard case for rays that are not parallel to the cylinder */ + const vfloat Q = sqrt(D); + const vfloat rcp_2A = rcp(2.0f*A); + const vfloat t0 = (-B-Q)*rcp_2A; + const vfloat t1 = (-B+Q)*rcp_2A; + + /* calculates u and Ng for near hit */ + { + u0_o = madd(t0,dOz,Oz)*rl; + const Vec3vf Pr = t0*Vec3vf(dir); + const Vec3vf Pl = madd(u0_o,p1-p0,p0); + Ng0_o = Pr-Pl; + } + + /* calculates u and Ng for far hit */ + { + u1_o = madd(t1,dOz,Oz)*rl; + const Vec3vf Pr = t1*Vec3vf(dir); + const Vec3vf Pl = madd(u1_o,p1-p0,p0); + Ng1_o = Pr-Pl; + } + + t_o.lower = 
select(valid, t0, vfloat(pos_inf)); + t_o.upper = select(valid, t1, vfloat(neg_inf)); + + /* special case for rays that are parallel to the cylinder */ + const vfloat eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz))); + vbool validt = valid & (abs(A) < eps); + if (unlikely(any(validt))) + { + vbool inside = C <= 0.0f; + t_o.lower = select(validt,select(inside,vfloat(neg_inf),vfloat(pos_inf)),t_o.lower); + t_o.upper = select(validt,select(inside,vfloat(pos_inf),vfloat(neg_inf)),t_o.upper); + valid &= !validt | inside; + } + return valid; + } + + __forceinline vbool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox>& t_o) const + { + vfloat u0_o; Vec3vf Ng0_o; + vfloat u1_o; Vec3vf Ng1_o; + return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o); + } + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h new file mode 100644 index 00000000000..e8305780e58 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h @@ -0,0 +1,216 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template + struct DiscIntersectorHitM + { + __forceinline DiscIntersectorHitM() {} + + __forceinline DiscIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) + { + } + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const + { + return Vec2f(vu[i], vv[i]); + } + __forceinline float t(const size_t i) const + { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const + { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct DiscIntersector1 + { + typedef CurvePrecalculations1 
Precalculations; + + template + static __forceinline bool intersect( + const vbool& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf& v0i, + const Epilog& epilog) + { + vbool valid = valid_i; + + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + const Vec3vf c0 = center - ray_org; + const vfloat projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat(ray.tnear()) <= projC0) & (projC0 <= vfloat(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf perp = c0 - projC0 * ray_dir; + const vfloat l2 = dot(perp, perp); + const vfloat r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template + static __forceinline bool intersect(const vbool& valid_i, + Ray& ray, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf& v0i, + const Vec3vf& normal, + const Epilog& epilog) + { + vbool valid = valid_i; + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + vfloat divisor = dot(Vec3vf((Vec3fa)ray.dir), normal); + const vbool parallel = divisor == vfloat(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat t = dot(center - Vec3vf((Vec3fa)ray.org), Vec3vf(normal)) / divisor; + + valid &= 
(vfloat(ray.tnear()) <= t) & (t <= vfloat(ray.tfar)); + if (unlikely(none(valid))) + return false; + + Vec3vf intersection = Vec3vf((Vec3fa)ray.org) + Vec3vf((Vec3fa)ray.dir) * t; + vfloat dist2 = dot(intersection - center, intersection - center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + + template + struct DiscIntersectorK + { + typedef CurvePrecalculationsK Precalculations; + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf& v0i, + const Epilog& epilog) + { + vbool valid = valid_i; + + const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + const Vec3vf c0 = center - ray_org; + const vfloat projC0 = dot(c0, ray_dir) * rd2; + + valid &= (vfloat(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections + if (unlikely(none(valid))) + return false; + + const Vec3vf perp = c0 - projC0 * ray_dir; + const vfloat l2 = dot(perp, perp); + const vfloat r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM hit(zero, zero, projC0, -ray_dir); + return epilog(valid, hit); + } + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, + size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf& v0i, + const Vec3vf& 
normal, + const Epilog& epilog) + { + vbool valid = valid_i; + const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + vfloat divisor = dot(Vec3vf(ray_dir), normal); + const vbool parallel = divisor == vfloat(0.f); + valid &= !parallel; + divisor = select(parallel, 1.f, divisor); // prevent divide by zero + + vfloat t = dot(center - Vec3vf(ray_org), Vec3vf(normal)) / divisor; + + valid &= (vfloat(ray.tnear()[k]) <= t) & (t <= vfloat(ray.tfar[k])); + if (unlikely(none(valid))) + return false; + + Vec3vf intersection = Vec3vf(ray_org) + Vec3vf(ray_dir) * t; + vfloat dist2 = dot(intersection - center, intersection - center); + valid &= dist2 < radius * radius; + if (unlikely(none(valid))) + return false; + + DiscIntersectorHitM hit(zero, zero, t, normal); + return epilog(valid, hit); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h new file mode 100644 index 00000000000..e1dc3aa98ec --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h @@ -0,0 +1,277 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "disc_intersector.h" +#include "intersector_epilog.h" +#include "pointi.h" + +namespace embree +{ + namespace isa + { + template + struct DiscMiIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom); + const vbool valid = 
Disc.template valid(); + DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom); + const vbool valid = Disc.template valid(); + return DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct DiscMiMBIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom, ray.time()); + const vbool valid = Disc.template valid(); + DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom, ray.time()); + const vbool valid = Disc.template valid(); + return DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct DiscMiIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + 
const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom); + const vbool valid = Disc.template valid(); + DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom); + const vbool valid = Disc.template valid(); + return DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct DiscMiMBIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool valid = Disc.template valid(); + DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Disc.gather(v0, geom, ray.time()[k]); + const vbool valid = Disc.template valid(); + return DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct OrientedDiscMiIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + 
+ static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom); + const vbool valid = Disc.template valid(); + DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom); + const vbool valid = Disc.template valid(); + return DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, n0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct OrientedDiscMiMBIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool valid = Disc.template valid(); + DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, n0, Intersect1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom, ray.time()); + const vbool valid = Disc.template valid(); + return DiscIntersector1::intersect( + valid, ray, context, geom, pre, v0, 
n0, Occluded1EpilogM(ray, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct OrientedDiscMiIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom); + const vbool valid = Disc.template valid(); + DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom); + const vbool valid = Disc.template valid(); + return DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + + template + struct OrientedDiscMiMBIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool valid = Disc.template valid(); + DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Intersect1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& Disc) + { + 
STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(Disc.geomID()); + Vec4vf v0; Vec3vf n0; + Disc.gather(v0, n0, geom, ray.time()[k]); + const vbool valid = Disc.template valid(); + return DiscIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, n0, + Occluded1KEpilogM(ray, k, context, Disc.geomID(), Disc.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/filter.h b/thirdparty/embree-aarch64/kernels/geometry/filter.h new file mode 100644 index 00000000000..4cdf7a395a4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/filter.h @@ -0,0 +1,204 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/ray.h" +#include "../common/hit.h" +#include "../common/context.h" + +namespace embree +{ + namespace isa + { + __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + + copyHitToRay(*(RayHit*)args->ray,*(Hit*)args->hit); + return true; + } + + __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runIntersectionFilter1Helper(&args,geometry,context); + } + + __forceinline void 
reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->intersectionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + + if (args->valid[0] == 0) + return false; + } + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + + if (args->valid[0] == 0) + return false; + } + return true; + } + + __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit) + { + RTCFilterFunctionNArguments args; + int mask = -1; + args.valid = &mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = 1; + return runOcclusionFilter1Helper(&args,geometry,context); + } + + __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) + { +#if defined(EMBREE_FILTER_FUNCTION) + IntersectContext* MAYBE_UNUSED context = args->internal_context; + const Geometry* const geometry = args->geometry; + if (geometry->occlusionFilterN) { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(filter_args); + } + + //if (args->valid[0] == 0) + // return false; + + if 
(context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(filter_args); + } +#endif + } + + template + __forceinline vbool runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint* mask = (vint*) args->valid; + if (geometry->intersectionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->intersectionFilterN(args); + } + + vbool valid_o = *mask != vint(zero); + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint(zero); + if (none(valid_o)) return valid_o; + + copyHitToRay(valid_o,*(RayHitK*)args->ray,*(HitK*)args->hit); + return valid_o; + } + + template + __forceinline vbool runIntersectionFilter(const vbool& valid, const Geometry* const geometry, RayHitK& ray, IntersectContext* context, HitK& hit) + { + RTCFilterFunctionNArguments args; + vint mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runIntersectionFilterHelper(&args,geometry,context); + } + + template + __forceinline vbool runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context) + { + vint* mask = (vint*) args->valid; + if (geometry->occlusionFilterN) + { + assert(context->scene->hasGeometryFilterFunction()); + geometry->occlusionFilterN(args); + } + + vbool valid_o = *mask != vint(zero); + + if (none(valid_o)) return valid_o; + + if (context->user->filter) { + assert(context->scene->hasContextFilterFunction()); + context->user->filter(args); + } + + valid_o = *mask != vint(zero); + + RayK* ray = (RayK*) args->ray; + ray->tfar = select(valid_o, vfloat(neg_inf), ray->tfar); + return valid_o; + } + + template + 
__forceinline vbool runOcclusionFilter(const vbool& valid, const Geometry* const geometry, RayK& ray, IntersectContext* context, HitK& hit) + { + RTCFilterFunctionNArguments args; + vint mask = valid.mask32(); + args.valid = (int*)&mask; + args.geometryUserPtr = geometry->userPtr; + args.context = context->user; + args.ray = (RTCRayN*)&ray; + args.hit = (RTCHitN*)&hit; + args.N = K; + return runOcclusionFilterHelper(&args,geometry,context); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h new file mode 100644 index 00000000000..46a0af08279 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h @@ -0,0 +1,99 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool& valid, RayK& ray) + : T(valid,ray) {} + }; + + class Grid1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef Grid1Precalculations Precalculations; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + } + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + intersect(pre,ray,context,prim,ty,lazy_node); + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + } + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + return occluded(pre,ray,context,prim,ty,lazy_node); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) { + assert(false && "not implemented"); + return false; + } + }; + + template + struct GridIntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; + + + static __forceinline void intersect(const vbool& valid, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline vbool occluded(const vbool& valid, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + GridSOAIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) + { + 
GridSOAIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); + } + }; + + typedef Grid1IntersectorK<4> SubdivPatch1Intersector4; + typedef Grid1IntersectorK<8> SubdivPatch1Intersector8; + typedef Grid1IntersectorK<16> SubdivPatch1Intersector16; + + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h new file mode 100644 index 00000000000..d3b275586c9 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h @@ -0,0 +1,275 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_subdiv_mesh.h" +#include "../bvh/bvh.h" +#include "../subdiv/tessellation.h" +#include "../subdiv/tessellation_cache.h" +#include "subdivpatch1.h" + +namespace embree +{ + namespace isa + { + class GridSOA + { + public: + + /*! GridSOA constructor */ + GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr); + + /*! 
Subgrid creation */ + template + static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps, + unsigned x0, unsigned x1, unsigned y0, unsigned y1, + const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + const unsigned width = x1-x0+1; + const unsigned height = y1-y0+1; + const GridRange range(0,width-1,0,height-1); + size_t bvhBytes = 0; + if (time_steps == 1) + bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0); + else { + bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0); + bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D)); + } + const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float); + size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); +#if !defined(__X86_64__) && !defined(__aarch64__) + rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding. +#endif + void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes); + assert(data); + return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get(patches->geomID()),bvhBytes,gridBytes,bounds_o); + } + + /*! Grid creation */ + template + static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps, + const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) + { + return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o); + } + + /*! returns reference to root */ + __forceinline BVH4::NodeRef& root(size_t t = 0) { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; } + + /*! 
returns pointer to BVH array */ + __forceinline int8_t* bvhData() { return &data[0]; } + __forceinline const int8_t* bvhData() const { return &data[0]; } + + /*! returns pointer to Grid array */ + __forceinline float* gridData(size_t t = 0) { return (float*) &data[gridOffset + t*gridBytes]; } + __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; } + + __forceinline void* encodeLeaf(size_t u, size_t v) { + return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf + } + __forceinline float* decodeLeaf(size_t t, const void* ptr) { + return gridData(t) + (((size_t) (ptr) >> 4) - 1); + } + + /*! returns the size of the BVH over the grid in bytes */ + static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes); + + /*! returns the size of the temporal BVH over the time range BVHs */ + static size_t getTemporalBVHBytes(const range time_range, const size_t nodeBytes); + + /*! calculates bounding box of grid range */ + __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const + { + const float* const grid_array = gridData(time); + const float* const grid_x_array = grid_array + 0 * dim_offset; + const float* const grid_y_array = grid_array + 1 * dim_offset; + const float* const grid_z_array = grid_array + 2 * dim_offset; + + /* compute the bounds just for the range! */ + BBox3fa bounds( empty ); + for (unsigned v = range.v_start; v<=range.v_end; v++) + { + for (unsigned u = range.u_start; u<=range.u_end; u++) + { + const float x = grid_x_array[ v * width + u]; + const float y = grid_y_array[ v * width + u]; + const float z = grid_z_array[ v * width + u]; + bounds.extend( Vec3fa(x,y,z) ); + } + } + assert(is_finite(bounds)); + return bounds; + } + + /*! Evaluates grid over patch and builds BVH4 tree over the grid. */ + std::pair buildBVH(BBox3fa* bounds_o); + + /*! Create BVH4 tree over grid. 
*/ + std::pair buildBVH(const GridRange& range, size_t& allocator); + + /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */ + std::pair buildMSMBlurBVH(const range time_range, BBox3fa* bounds_o); + + /*! Create MBlur BVH4 tree over grid. */ + std::pair buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator); + + /*! Create MSMBlur BVH4 tree over grid. */ + std::pair buildMSMBlurBVH(const range time_range, size_t& allocator, BBox3fa* bounds_o); + + template + struct MapUV + { + typedef typename Loader::vfloat vfloat; + const float* const grid_uv; + size_t line_offset; + size_t lines; + + __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines) + : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {} + + __forceinline void operator() (vfloat& u, vfloat& v) const { + const Vec3 tri_v012_uv = Loader::gather(grid_uv,line_offset,lines); + const Vec2 uv0 = GridSOA::decodeUV(tri_v012_uv[0]); + const Vec2 uv1 = GridSOA::decodeUV(tri_v012_uv[1]); + const Vec2 uv2 = GridSOA::decodeUV(tri_v012_uv[2]); + const Vec2 uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0; + u = uv[0];v = uv[1]; + } + }; + + struct Gather2x3 + { + enum { M = 4 }; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset); + vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + if (unlikely(line_offset == 2)) + { + r0 = shuffle<0,1,1,1>(r0); + r1 = shuffle<0,1,1,1>(r1); + } + return Vec3vf4(unpacklo(r0,r1), // r00, r10, r01, r11 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const 
grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf4& v0_o, + Vec3vf4& v1_o, + Vec3vf4& v2_o) + { + const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; + +#if defined (__AVX__) + struct Gather3x3 + { + enum { M = 8 }; + typedef vbool8 vbool; + typedef vint8 vint; + typedef vfloat8 vfloat; + + static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines) + { + vfloat4 ra = vfloat4::loadu(grid + 0*line_offset); + vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid + vfloat4 rc; + if (likely(lines > 2)) + rc = vfloat4::loadu(grid + 2*line_offset); + else + rc = rb; + + if (unlikely(line_offset == 2)) + { + ra = shuffle<0,1,1,1>(ra); + rb = shuffle<0,1,1,1>(rb); + rc = shuffle<0,1,1,1>(rc); + } + + const vfloat8 r0 = vfloat8(ra,rb); + const vfloat8 r1 = vfloat8(rb,rc); + return Vec3vf8(unpacklo(r0,r1), // r00, r10, r01, r11, r10, r20, r11, r21 + shuffle<1,1,2,2>(r0), // r01, r01, r02, r02, r11, r11, r12, r12 + shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12, r20, r21, r21, r22 + } + + static __forceinline void gather(const float* const grid_x, + const float* const grid_y, + const float* const grid_z, + const size_t line_offset, + const size_t lines, + Vec3vf8& v0_o, + Vec3vf8& v1_o, + Vec3vf8& v2_o) + { + const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines); + const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines); + const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines); + v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]); + v1_o = 
Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]); + v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]); + } + }; +#endif + + template + static __forceinline Vec2 decodeUV(const vfloat& uv) + { + typedef typename vfloat::Int vint; + const vint iu = asInt(uv) & 0xffff; + const vint iv = srl(asInt(uv),16); + const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000); + const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000); + return Vec2(u,v); + } + + __forceinline unsigned int geomID() const { + return _geomID; + } + + __forceinline unsigned int primID() const { + return _primID; + } + + public: + BVH4::NodeRef troot; +#if !defined(__X86_64__) && !defined(__aarch64__) + unsigned align1; +#endif + unsigned time_steps; + unsigned width; + + unsigned height; + unsigned dim_offset; + unsigned _geomID; + unsigned _primID; + + unsigned align2; + unsigned gridOffset; + unsigned gridBytes; + unsigned rootOffset; + + int8_t data[1]; //!< after the struct we first store the BVH, then the grid, and finally the roots + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h new file mode 100644 index 00000000000..2ed922a5aeb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h @@ -0,0 +1,207 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + class GridSOAIntersector1 + { + public: + typedef void Primitive; + + class Precalculations + { + public: + __forceinline Precalculations (const Ray& ray, const void* ptr) + : grid(nullptr) {} + + public: + GridSOA* grid; + int itime; + float ftime; + }; + + template + static __forceinline void intersect(RayHit& ray, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t 
lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3 v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + GridSOA::MapUV mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1 intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template + static __forceinline bool occluded(Ray& ray, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3 v0, v1, v2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + + GridSOA::MapUV mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1 intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + intersect( ray, context, grid_x, line_offset, lines, pre); +#else + intersect(ray, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect(ray, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded( ray, context, grid_x, line_offset, lines, pre); +#else + if (occluded(ray, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + class GridSOAMBIntersector1 + { + public: + typedef void Primitive; + typedef GridSOAIntersector1::Precalculations Precalculations; + + template + static __forceinline void intersect(RayHit& ray, const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3 a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3 b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3 v0 = lerp(a0,b0,vfloat(ftime)); + Vec3 v1 = lerp(a1,b1,vfloat(ftime)); + Vec3 v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1 intersector(ray,nullptr); + intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template + static __forceinline bool occluded(Ray& ray, const float ftime, + IntersectContext* context, + const 
float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3 a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3 b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3 v0 = lerp(a0,b0,vfloat(ftime)); + Vec3 v1 = lerp(a1,b1,vfloat(ftime)); + Vec3 v2 = lerp(a2,b2,vfloat(ftime)); + + GridSOA::MapUV mapUV(grid_uv,line_offset,lines); + PlueckerIntersector1 intersector(ray,nullptr); + return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + intersect( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + intersect(ray, pre.ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(pre.itime,prim); + +#if defined(__AVX__) + return occluded( ray, pre.ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded(ray, pre.ftime, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h new file mode 100644 index 00000000000..41d66e1e285 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h @@ -0,0 +1,445 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "grid_soa.h" +#include "../common/ray.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + template + struct MapUV0 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat& u, vfloat& v) const { + const vfloat uv00(grid_uv[ofs00]); + const vfloat uv01(grid_uv[ofs01]); + const vfloat uv10(grid_uv[ofs10]); + const vfloat uv11(grid_uv[ofs11]); + const Vec2vf uv0 = GridSOA::decodeUV(uv00); + const Vec2vf uv1 = GridSOA::decodeUV(uv01); + const Vec2vf uv2 = GridSOA::decodeUV(uv10); + const Vec2vf uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; 
+ + template + struct MapUV1 + { + const float* const grid_uv; + size_t ofs00, ofs01, ofs10, ofs11; + + __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11) + : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {} + + __forceinline void operator() (vfloat& u, vfloat& v) const { + const vfloat uv00(grid_uv[ofs00]); + const vfloat uv01(grid_uv[ofs01]); + const vfloat uv10(grid_uv[ofs10]); + const vfloat uv11(grid_uv[ofs11]); + const Vec2vf uv0 = GridSOA::decodeUV(uv10); + const Vec2vf uv1 = GridSOA::decodeUV(uv01); + const Vec2vf uv2 = GridSOA::decodeUV(uv11); + const Vec2vf uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0)); + u = uv[0]; v = uv[1]; + } + }; + + template + class GridSOAIntersectorK + { + public: + typedef void Primitive; + + class Precalculations + { +#if defined(__AVX__) + static const int M = 8; +#else + static const int M = 4; +#endif + + public: + __forceinline Precalculations (const vbool& valid, const RayK& ray) + : grid(nullptr), intersector(valid,ray) {} + + public: + GridSOA* grid; + PlueckerIntersectorK intersector; // FIXME: use quad intersector + }; + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return !valid; + } + + template + static __forceinline void intersect(RayHitK& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3 v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Intersect1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template + static __forceinline bool occluded(RayK& ray, size_t k, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + Vec3 v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2); + return 
pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Occluded1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); +#if defined(__AVX__) + intersect( ray, k, context, grid_x, line_offset, lines, pre); +#else + intersect(ray, k, context, grid_x , line_offset, lines, pre); + if (likely(lines > 2)) + intersect(ray, k, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(0,prim); + +#if defined(__AVX__) + return occluded( ray, k, context, grid_x, line_offset, lines, pre); +#else + if (occluded(ray, k, context, grid_x , line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + + template + class GridSOAMBIntersectorK + { + public: + typedef void Primitive; + typedef typename GridSOAIntersectorK::Precalculations Precalculations; + + /*! Intersect a ray with the primitive. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat vftime; + vint vitime = getTimeSegment(ray.time(), vfloat((float)(pre.grid->time_steps-1)), vftime); + + vbool valid1 = valid_i; + while (any(valid1)) { + const size_t j = bsf(movemask(valid1)); + const int itime = vitime[j]; + const vbool valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, const vfloat& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 
1 : 2; + for (size_t y=0; y a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf p00 = lerp(a00,b00,ftime); + const Vec3vf p01 = lerp(a01,b01,ftime); + const Vec3vf p10 = lerp(a10,b10,ftime); + const Vec3vf p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID())); + } + } + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + vfloat vftime; + vint vitime = getTimeSegment(ray.time(), vfloat((float)(pre.grid->time_steps-1)), vftime); + + vbool valid_o = valid_i; + vbool valid1 = valid_i; + while (any(valid1)) { + const int j = int(bsf(movemask(valid1))); + const int itime = vitime[j]; + const vbool valid2 = valid1 & (itime == vitime); + valid1 = valid1 & !valid2; + valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); + } + return !valid_o; + } + + /*! 
Test if the ray is occluded by the primitive */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, const vfloat& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const size_t line_offset = pre.grid->width; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + vbool valid = valid_i; + const size_t max_x = pre.grid->width == 2 ? 1 : 2; + const size_t max_y = pre.grid->height == 2 ? 1 : 2; + for (size_t y=0; y a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + ofs00 += grid_offset; + ofs01 += grid_offset; + ofs10 += grid_offset; + ofs11 += grid_offset; + const Vec3vf b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]); + const Vec3vf b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]); + const Vec3vf b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]); + const Vec3vf b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]); + const Vec3vf p00 = lerp(a00,b00,ftime); + const Vec3vf p01 = lerp(a01,b01,ftime); + const Vec3vf p10 = lerp(a10,b10,ftime); + const Vec3vf p11 = lerp(a11,b11,ftime); + + pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID())); + if (none(valid)) break; + } + } + return valid; + } + + template + static 
__forceinline void intersect(RayHitK& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3 a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3 b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3 v0 = lerp(a0,b0,vfloat(ftime)); + Vec3 v1 = lerp(a1,b1,vfloat(ftime)); + Vec3 v2 = lerp(a2,b2,vfloat(ftime)); + + pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Intersect1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + }; + + template + static __forceinline bool occluded(RayK& ray, size_t k, + const float ftime, + IntersectContext* context, + const float* const grid_x, + const size_t line_offset, + const size_t lines, + Precalculations& pre) + { + typedef typename Loader::vfloat vfloat; + const size_t grid_offset = pre.grid->gridBytes >> 2; + const size_t dim_offset = pre.grid->dim_offset; + const float* const grid_y = grid_x + 1 * dim_offset; + const float* const grid_z = grid_x + 2 * dim_offset; + const float* const grid_uv = grid_x + 3 * dim_offset; + + Vec3 a0, a1, a2; + Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2); + + Vec3 b0, b1, b2; + Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2); + + Vec3 v0 = lerp(a0,b0,vfloat(ftime)); + Vec3 v1 = lerp(a1,b1,vfloat(ftime)); + Vec3 v2 = lerp(a2,b2,vfloat(ftime)); + + return 
pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV(grid_uv,line_offset,lines),Occluded1KEpilogMU(ray,k,context,pre.grid->geomID(),pre.grid->primID())); + } + + /*! Intersect a ray with the primitive. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + intersect( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + intersect(ray, k, ftime, context, grid_x, line_offset, lines, pre); + if (likely(lines > 2)) + intersect(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre); +#endif + } + + /*! Test if the ray is occluded by the primitive */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + float ftime; + int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime); + + const size_t line_offset = pre.grid->width; + const size_t lines = pre.grid->height; + const float* const grid_x = pre.grid->decodeLeaf(itime,prim); + +#if defined(__AVX__) + return occluded( ray, k, ftime, context, grid_x, line_offset, lines, pre); +#else + if (occluded(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true; + if (likely(lines > 2)) + if (occluded(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true; +#endif + return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance.h b/thirdparty/embree-aarch64/kernels/geometry/instance.h new file mode 100644 index 00000000000..66893d581f0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/instance.h @@ -0,0 +1,78 @@ +// 
Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene_instance.h" + +namespace embree +{ + struct InstancePrimitive + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return N; } + + public: + + InstancePrimitive (const Instance* instance, unsigned int instID) + : instance(instance) + , instID_(instID) + {} + + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + assert(end-i == 1); + const PrimRef& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get(geomID); + new (this) InstancePrimitive(instance, geomID); + } + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + assert(end-i == 1); + const PrimRef& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + assert(end-i == 1); + const PrimRefMB& prim = prims[i]; i++; + const unsigned int geomID = prim.geomID(); + const Instance* instance = scene->get(geomID); + new (this) InstancePrimitive(instance,geomID); + return instance->linearBounds(0,time_range); + } + + /* Updates the primitive */ + 
__forceinline BBox3fa update(Instance* instance) { + return instance->bounds(0); + } + + public: + const Instance* instance; + const unsigned int instID_ = std::numeric_limits::max (); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h new file mode 100644 index 00000000000..91731a39c5a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h @@ -0,0 +1,84 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "instance.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + struct InstanceIntersector1 + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + struct InstanceIntersector1MB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim); + static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim); + static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim); + }; + + template + struct InstanceIntersectorK + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool& valid, const RayK& ray) {} + }; + + static void intersect(const vbool& valid_i, const 
Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim); + static vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool(1< + struct InstanceIntersectorKMB + { + typedef InstancePrimitive Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool& valid, const RayK& ray) {} + }; + + static void intersect(const vbool& valid_i, const Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim); + static vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim); + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool(1< + struct UVIdentity { + __forceinline void operator() (vfloat& u, vfloat& v) const {} + }; + + + template + struct Intersect1Epilog1 + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1Epilog1(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if 
(filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar; + ray.tfar = hit.t; + bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + + /* update hit information */ + ray.tfar = hit.t; + ray.Ng = hit.Ng; + ray.u = hit.u; + ray.v = hit.v; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy(context->user->instID, ray.instID); + return true; + } + }; + + template + struct Occluded1Epilog1 + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1Epilog1(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar; + ray.tfar = hit.t; + const bool found = runOcclusionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + return found; + } + } +#endif + return true; + } + }; + + template + struct Intersect1KEpilog1 + { + RayHitK& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilog1(RayHitK& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), 
geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + hit.finalize(); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runIntersectionFilter(vbool(1<*, const size_t&>(context->user->instID, ray.instID, k); + return true; + } + }; + + template + struct Occluded1KEpilog1 + { + RayK& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilog1(RayK& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) { + hit.finalize(); + HitK h(context->user,geomID,primID,hit.u,hit.v,hit.Ng); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t; + const bool found = any(runOcclusionFilter(vbool(1< + struct Intersect1EpilogM + { + RayHit& ray; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + + __forceinline Intersect1EpilogM(RayHit& ray, + IntersectContext* context, + const vuint& 
geomIDs, + const vuint& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool valid = valid_i; + if (Mx > M) valid &= (1<get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + + /* update hit information */ + const Vec2f uv = hit.uv(i); + ray.tfar = hit.vt[i]; + ray.Ng.x = hit.vNg.x[i]; + ray.Ng.y = hit.vNg.y[i]; + ray.Ng.z = hit.vNg.z[i]; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primIDs[i]; + ray.geomID = geomID; + instance_id_stack::copy(context->user->instID, ray.instID); + return true; + + } + }; + +#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4 + template + struct Intersect1EpilogM + { + static const size_t Mx = 16; + RayHit& ray; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + + __forceinline Intersect1EpilogM(RayHit& ray, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* MAYBE_UNUSED scene = context->scene; + vbool valid = valid_i; + 
if (Mx > M) valid &= (1<get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + continue; + } + } +#endif + break; + } +#endif + + vbool finalMask(((unsigned int)1 << i)); + ray.update(finalMask,hit.vt,hit.vu,hit.vv,hit.vNg.x,hit.vNg.y,hit.vNg.z,geomID,primIDs); + instance_id_stack::foreach([&](unsigned level) + { + ray.instID[level] = context->user->instID[level]; + return (context->user->instID[level] != RTC_INVALID_GEOMETRY_ID); + }); + return true; + + } + }; +#endif + + template + struct Occluded1EpilogM + { + Ray& ray; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + + __forceinline Occluded1EpilogM(Ray& ray, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool valid = valid_i; + if (Mx > M) valid &= (1<get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if 
defined(EMBREE_FILTER_FUNCTION) + /* if we have no filter then the test passed */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + m=btc(m,i); + continue; + } + } +#endif + break; + } +#endif + + return true; + } + }; + + template + struct Intersect1EpilogMU + { + RayHit& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1EpilogMU(RayHit& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + vbool valid = valid_i; + hit.finalize(); + + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + /* call intersection filter function */ + Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const bool found = runIntersectionFilter1(geometry,ray,context,h); + if (!found) ray.tfar = old_t; + foundhit |= found; + clear(valid,i); + valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value + if (unlikely(none(valid))) break; + i = select_min(valid,hit.vt); + } + return foundhit; + } +#endif + + /* update hit information */ + const Vec2f uv 
= hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar = hit.t(i); + ray.Ng.x = Ng.x; + ray.Ng.y = Ng.y; + ray.Ng.z = Ng.z; + ray.u = uv.x; + ray.v = uv.y; + ray.primID = primID; + ray.geomID = geomID; + instance_id_stack::copy(context->user->instID, ray.instID); + return true; + } + }; + + template + struct Occluded1EpilogMU + { + Ray& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1EpilogMU(Ray& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (const vbool& valid, Hit& hit) const + { + /* ray mask test */ + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + if ((geometry->mask & ray.mask) == 0) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar; + ray.tfar = hit.t(i); + HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (runOcclusionFilter1(geometry,ray,context,h)) return true; + ray.tfar = old_t; + } + return false; + } +#endif + return true; + } + }; + + template + struct IntersectKEpilogM + { + RayHitK& ray; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + const size_t i; + + __forceinline IntersectKEpilogM(RayHitK& ray, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs, + size_t i) + : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template + __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + vfloat u, v, t; + 
Vec3vf Ng; + vbool valid = valid_i; + + std::tie(u,v,t,Ng) = hit(); + + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK h(context->user,geomID,primID,u,v,Ng); + const vfloat old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool m_accept = runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat::store(valid,&ray.tfar,t); + vfloat::store(valid,&ray.Ng.x,Ng.x); + vfloat::store(valid,&ray.Ng.y,Ng.y); + vfloat::store(valid,&ray.Ng.z,Ng.z); + vfloat::store(valid,&ray.u,u); + vfloat::store(valid,&ray.v,v); + vuint::store(valid,&ray.primID,primID); + vuint::store(valid,&ray.geomID,geomID); + instance_id_stack::copy*, const vbool&>(context->user->instID, ray.instID, valid); + return valid; + } + }; + + template + struct OccludedKEpilogM + { + vbool& valid0; + RayK& ray; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + const size_t i; + + __forceinline OccludedKEpilogM(vbool& valid0, + RayK& ray, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs, + size_t i) + : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {} + + template + __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const + { + vbool valid = valid_i; + + /* ray masking test */ + Scene* scene MAYBE_UNUSED = context->scene; + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); 
+#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return valid; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat u, v, t; + Vec3vf Ng; + std::tie(u,v,t,Ng) = hit(); + HitK h(context->user,geomID,primID,u,v,Ng); + const vfloat old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template + struct IntersectKEpilogMU + { + RayHitK& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline IntersectKEpilogMU(RayHitK& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline vbool operator() (const vbool& valid_org, const Hit& hit) const + { + vbool valid = valid_org; + vfloat u, v, t; + Vec3vf Ng; + std::tie(u,v,t,Ng) = hit(); + + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + + /* ray masking test */ +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + HitK h(context->user,geomID,primID,u,v,Ng); + const vfloat old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + const vbool m_accept = runIntersectionFilter(valid,geometry,ray,context,h); + ray.tfar = select(m_accept,ray.tfar,old_t); + return m_accept; + } + } +#endif + + /* update hit information */ + vfloat::store(valid,&ray.tfar,t); + 
vfloat::store(valid,&ray.Ng.x,Ng.x); + vfloat::store(valid,&ray.Ng.y,Ng.y); + vfloat::store(valid,&ray.Ng.z,Ng.z); + vfloat::store(valid,&ray.u,u); + vfloat::store(valid,&ray.v,v); + vuint::store(valid,&ray.primID,primID); + vuint::store(valid,&ray.geomID,geomID); + instance_id_stack::copy*, const vbool&>(context->user->instID, ray.instID, valid); + + return valid; + } + }; + + template + struct OccludedKEpilogMU + { + vbool& valid0; + RayK& ray; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline OccludedKEpilogMU(vbool& valid0, + RayK& ray, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline vbool operator() (const vbool& valid_i, const Hit& hit) const + { + vbool valid = valid_i; + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); + +#if defined(EMBREE_RAY_MASK) + valid &= (geometry->mask & ray.mask) != 0; + if (unlikely(none(valid))) return false; +#endif + + /* occlusion filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + vfloat u, v, t; + Vec3vf Ng; + std::tie(u,v,t,Ng) = hit(); + HitK h(context->user,geomID,primID,u,v,Ng); + const vfloat old_t = ray.tfar; + ray.tfar = select(valid,t,ray.tfar); + valid = runOcclusionFilter(valid,geometry,ray,context,h); + ray.tfar = select(valid,ray.tfar,old_t); + } + } +#endif + + /* update occlusion */ + valid0 = valid0 & !valid; + return valid; + } + }; + + template + struct Intersect1KEpilogM + { + RayHitK& ray; + size_t k; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + + __forceinline Intersect1KEpilogM(RayHitK& ray, size_t k, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs) + : ray(ray), k(k), context(context), 
geomIDs(geomIDs), primIDs(primIDs) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + vbool valid = valid_i; + hit.finalize(); + if (Mx > M) valid &= (1<get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + clear(valid,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* call intersection filter function */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) { + assert(i h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + const bool found = any(runIntersectionFilter(vbool(1<(hit.vNg.x),vfloat(hit.vNg.y),vfloat(hit.vNg.z),geomID,vuint(primIDs)); +#else + const Vec2f uv = hit.uv(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = hit.vNg.x[i]; + ray.Ng.y[k] = hit.vNg.y[i]; + ray.Ng.z[k] = hit.vNg.z[i]; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primIDs[i]; + ray.geomID[k] = geomID; + instance_id_stack::copy*, const size_t&>(context->user->instID, ray.instID, k); +#endif + return true; + } + }; + + template + struct Occluded1KEpilogM + { + RayK& ray; + size_t k; + IntersectContext* context; + const vuint& geomIDs; + const vuint& primIDs; + + __forceinline Occluded1KEpilogM(RayK& ray, size_t k, + IntersectContext* context, + const vuint& geomIDs, + const vuint& primIDs) + : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK) + if (unlikely(filter)) + hit.finalize(); /* called only once */ + + vbool valid = valid_i; + if (Mx > M) valid &= (1<get(geomID); + +#if defined(EMBREE_RAY_MASK) + /* goto next hit if mask test 
fails */ + if ((geometry->mask & ray.mask[k]) == 0) { + m=btc(m,i); + continue; + } +#endif + +#if defined(EMBREE_FILTER_FUNCTION) + /* execute occlusion filer */ + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool(1< + struct Intersect1KEpilogMU + { + RayHitK& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Intersect1KEpilogMU(RayHitK& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* finalize hit calculation */ + vbool valid = valid_i; + hit.finalize(); + size_t i = select_min(valid,hit.vt); + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) + { + bool foundhit = false; + while (true) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + const bool found = any(runIntersectionFilter(vbool(1<(Ng.x),vfloat(Ng.y),vfloat(Ng.z),geomID,vuint(primID)); +#else + const Vec2f uv = hit.uv(i); + const Vec3fa Ng = hit.Ng(i); + ray.tfar[k] = hit.t(i); + ray.Ng.x[k] = Ng.x; + ray.Ng.y[k] = Ng.y; + ray.Ng.z[k] = Ng.z; + ray.u[k] = uv.x; + ray.v[k] = uv.y; + ray.primID[k] = primID; + ray.geomID[k] = geomID; + 
instance_id_stack::copy*, const size_t&>(context->user->instID, ray.instID, k); +#endif + return true; + } + }; + + template + struct Occluded1KEpilogMU + { + RayK& ray; + size_t k; + IntersectContext* context; + const unsigned int geomID; + const unsigned int primID; + + __forceinline Occluded1KEpilogMU(RayK& ray, size_t k, + IntersectContext* context, + const unsigned int geomID, + const unsigned int primID) + : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {} + + template + __forceinline bool operator() (const vbool& valid_i, Hit& hit) const + { + Scene* scene MAYBE_UNUSED = context->scene; + Geometry* geometry MAYBE_UNUSED = scene->get(geomID); +#if defined(EMBREE_RAY_MASK) + /* ray mask test */ + if ((geometry->mask & ray.mask[k]) == 0) + return false; +#endif + + /* intersection filter test */ +#if defined(EMBREE_FILTER_FUNCTION) + if (filter) { + if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) + { + hit.finalize(); + for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) + { + const Vec2f uv = hit.uv(i); + const float old_t = ray.tfar[k]; + ray.tfar[k] = hit.t(i); + HitK h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i)); + if (any(runOcclusionFilter(vbool(1< + struct ArrayIntersector1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + for (size_t i=0; i + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + for (size_t i=0; i + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const 
Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0; i + static __forceinline void intersectK(const vbool& valid, /* PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + } + + template + static __forceinline vbool occludedK(const vbool& valid, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + return valid; + } + }; + + template + struct ArrayIntersectorK_1 + { + typedef typename Intersector::Primitive Primitive; + typedef typename Intersector::Precalculations Precalculations; + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + for (size_t i=0; i + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + vbool valid0 = valid; + for (size_t i=0; i + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + for (size_t i=0; i + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + for (size_t i=0; i + struct ArrayIntersectorKStream + { + typedef typename IntersectorK::Primitive PrimitiveK; + typedef typename IntersectorK::Precalculations PrecalculationsK; + + static __forceinline void intersectK(const vbool& valid, const Accel::Intersectors* This, /* 
PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(valid,ray); // FIXME: might cause trouble + + for (size_t i=0; i occludedK(const vbool& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(valid,ray); // FIXME: might cause trouble + vbool valid0 = valid; + for (size_t i=0; i& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + for (size_t i=0; i** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node) + { + size_t m_occluded = 0; + for (size_t i=0; i &ray = *inputPackets[rayID / K]; + const size_t k = rayID % K; + PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble + if (IntersectorK::occluded(pre,ray,k,context,prim[i])) + { + m_occluded |= (size_t)1 << rayID; + ray.tfar[k] = neg_inf; + } + } + } + return m_occluded; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h new file mode 100644 index 00000000000..eef5b0b1fd1 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h @@ -0,0 +1,141 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template + struct LineIntersectorHitM + { + __forceinline LineIntersectorHitM() {} + 
+ __forceinline LineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct FlatLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template + static __forceinline bool intersect(const vbool& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& v0i, const Vec4vf& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool valid = valid_i; + vfloat depth_scale = pre.depth_scale; + LinearSpace3> ray_space = pre.ray_space; + + const Vec3vf ray_org ((Vec3fa)ray.org); + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + + Vec4vf p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf v = p1-p0; + const Vec4vf w = -p0; + const vfloat d0 = madd(w.x,v.x,w.y*v.y); + const vfloat d1 = madd(v.x,v.x,v.y*v.y); + const vfloat u = clamp(d0*rcp(d1),vfloat(zero),vfloat(one)); + const Vec4vf p = madd(u,v,p0); + const vfloat t = p.z; + const vfloat d2 = madd(p.x,p.x,p.y*p.y); + const vfloat r = p.w; + const vfloat r2 = r*r; + valid &= (d2 <= r2) & (vfloat(ray.tnear()) <= t) & (t <= vfloat(ray.tfar)); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore 
denormalized segments */ + const Vec3vf T = v1.xyz()-v0.xyz(); + valid &= (T.x != vfloat(zero)) | (T.y != vfloat(zero)) | (T.z != vfloat(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + + template + struct FlatLinearCurveIntersectorK + { + typedef CurvePrecalculationsK Precalculations; + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& v0i, const Vec4vf& v1i, + const Epilog& epilog) + { + /* transform end points into ray space */ + vbool valid = valid_i; + vfloat depth_scale = pre.depth_scale[k]; + LinearSpace3> ray_space = pre.ray_space[k]; + const Vec3vf ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + + Vec4vf p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); + Vec4vf p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w); + + /* approximative intersection with cone */ + const Vec4vf v = p1-p0; + const Vec4vf w = -p0; + const vfloat d0 = madd(w.x,v.x,w.y*v.y); + const vfloat d1 = madd(v.x,v.x,v.y*v.y); + const vfloat u = clamp(d0*rcp(d1),vfloat(zero),vfloat(one)); + const Vec4vf p = madd(u,v,p0); + const vfloat t = p.z; + const vfloat d2 = madd(p.x,p.x,p.y*p.y); + const vfloat r = p.w; + const vfloat r2 = r*r; + valid &= (d2 <= r2) & (vfloat(ray.tnear()[k]) <= t) & (t <= vfloat(ray.tfar[k])); + if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) + valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections + if (unlikely(none(valid))) return false; + + /* ignore denormalized segments */ + const Vec3vf T = v1.xyz()-v0.xyz(); + valid &= (T.x != 
vfloat(zero)) | (T.y != vfloat(zero)) | (T.z != vfloat(zero)); + if (unlikely(none(valid))) return false; + + /* update hit information */ + LineIntersectorHitM hit(u,zero,t,T); + return epilog(valid,hit); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei.h b/thirdparty/embree-aarch64/kernels/geometry/linei.h new file mode 100644 index 00000000000..a72029ca53d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/linei.h @@ -0,0 +1,709 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + template + struct LineMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); } + + public: + + /* Default constructor */ + __forceinline LineMi() { } + + /* Construction from vertices and IDs */ + __forceinline LineMi(const vuint& v0, unsigned short leftExists, unsigned short rightExists, const vuint& geomIDs, const vuint& primIDs, Geometry::GType gtype) + : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint(primIDs) != vuint(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs) + { + assert(all(vuint(geomID()) == 
geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool valid() const { return primIDs != vuint(-1); } + + /* Returns a mask that tells which line segments are valid */ + template + __forceinline vbool valid() const { return vuint(primIDs) != vuint(-1); } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const { assert(i + //static __forceinline T unmask(T &index) { return index & 0x3fffffff; } + + __forceinline unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } + //__forceinline vuint geomID() { return unmask(geomIDs); } + //__forceinline const vuint geomID() const { return unmask(geomIDs); } + //__forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } + __forceinline const vuint& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i& p0, + Vec4vf& p1, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf& p0, + Vec4vf& p1, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf& p0, + Vec4vf& p1, + const LineSegments* geom, + float time) const; + + /* gather the line segments with lateral info */ + __forceinline void gather(Vec4vf& p0, + Vec4vf& p1, + Vec4vf& pL, + Vec4vf& pR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf& p0, + Vec4vf& p1, + Vec4vf& pL, + Vec4vf& pR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf& p0, + Vec4vf& p1, + Vec4vf& pL, + Vec4vf& pR, + const LineSegments* geom, + float time) const; + + __forceinline void gather(Vec4vf& p0, + Vec4vf& p1, + vbool& cL, + vbool& cR, + const LineSegments* geom) const; + + __forceinline void gatheri(Vec4vf& p0, + Vec4vf& p1, + vbool& cL, + vbool& cR, + const LineSegments* geom, + const int itime) const; + + __forceinline void gather(Vec4vf& p0, + Vec4vf& p1, + vbool& cL, + vbool& cR, 
+ const LineSegments* geom, + float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i=0; iget(geomID(i)); + const Vec3ff& p0 = geom->vertex(v0[i]+0,itime); + const Vec3ff& p1 = geom->vertex(v0[i]+1,itime); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint geomID, primID; + vuint v0; + unsigned short leftExists = 0; + unsigned short rightExists = 0; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; iget(prim->geomID()); + if (begingeomID(); + primID[i] = prim->primID(); + v0[i] = geom->segment(prim->primID()); + leftExists |= geom->segmentLeftExists(primID[i]) << i; + rightExists |= geom->segmentRightExists(primID[i]) << i; + begin++; + } else { + assert(i); + if (i>0) { + geomID[i] = geomID[i-1]; + primID[i] = -1; + v0[i] = v0[i-1]; + } + } + if (begin + __forceinline static typename 
BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range& set, const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = LineMi::blocks(set.size()); + size_t numbytes = LineMi::bytes(set.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + for (size_t i=0; iscene); + } + return bvh->encodeLeaf((char*)accel,items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims,begin,end,scene); + return linearBounds(scene,itime); + } + + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims,begin,end,scene); + return linearBounds(scene,time_range); + } + + template + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.begin(); + size_t end = prims.end(); + size_t items = LineMi::blocks(prims.size()); + size_t numbytes = LineMi::bytes(prims.size()); + LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa bounds = empty; + for (size_t i=0; idata(),start,end,bvh->scene,prims.time_range)); + + return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range); + }; + + /* Updates the primitive */ + __forceinline BBox3fa update(LineSegments* geom) + { + BBox3fa bounds = empty; + for (size_t i=0; ivertex(v0[i]+0); + const Vec3ff& p1 = geom->vertex(v0[i]+1); + BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1)); + b = enlarge(b,Vec3fa(max(p0.w,p1.w))); + bounds.extend(b); + } + return bounds; + } + + /*! 
output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char m; + unsigned int sharedGeomID; + unsigned short leftExists, rightExists; + vuint v0; // index of start vertex + private: + vuint primIDs; // primitive ID + }; + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + 
Vec4vf4& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + vbool4& cL, + vbool4& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf4 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + cL = !vbool4(leftExists); + cR = !vbool4(rightExists); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const 
vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + transpose(a0,a1,a2,a3,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + transpose(b0,b1,b2,b3,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? 
vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<4>::gather(Vec4vf4& p0, + Vec4vf4& p1, + Vec4vf4& pL, + Vec4vf4& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf4 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + pL = lerp(aL,bL,vfloat4(ftime)); + pR = lerp(aR,bR,vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = 
vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime 
= geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0])); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1])); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2])); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3])); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4])); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5])); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6])); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7])); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? 
vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? 
vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime)); + transpose(a0,a1,a2,a3,a4,a5,a6,a7,p0.x,p0.y,p0.z,p0.w); + + const vfloat4 b0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime)); + const vfloat4 b1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime)); + const vfloat4 b2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime)); + const vfloat4 b3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime)); + const vfloat4 b4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime)); + const vfloat4 b5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime)); + const vfloat4 b6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime)); + const vfloat4 b7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime)); + transpose(b0,b1,b2,b3,b4,b5,b6,b7,p1.x,p1.y,p1.z,p1.w); + + const vfloat4 l0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf); + const vfloat4 l1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf); + const vfloat4 l2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf); + const vfloat4 l3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf); + const vfloat4 l4 = (leftExists & (1<<4)) ? 
vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf); + const vfloat4 l5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf); + const vfloat4 l6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf); + const vfloat4 l7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf); + transpose(l0,l1,l2,l3,l4,l5,l6,l7,pL.x,pL.y,pL.z,pL.w); + + const vfloat4 r0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf); + const vfloat4 r1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf); + const vfloat4 r2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf); + const vfloat4 r3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf); + const vfloat4 r4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf); + const vfloat4 r5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf); + const vfloat4 r6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf); + const vfloat4 r7 = (rightExists & (1<<7)) ? 
vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf); + transpose(r0,r1,r2,r3,r4,r5,r6,r7,pR.x,pR.y,pR.z,pR.w); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + Vec4vf8& pL, + Vec4vf8& pR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1,aL,aR; + gatheri(a0,a1,aL,aR,geom,itime); + Vec4vf8 b0,b1,bL,bR; + gatheri(b0,b1,bL,bR,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + pL = lerp(aL,bL,vfloat8(ftime)); + pR = lerp(aR,bR,vfloat8(ftime)); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom) const + { + gather(p0,p1,geom); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + const int itime) const + { + gatheri(p0,p1,geom,itime); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + + template<> + __forceinline void LineMi<8>::gather(Vec4vf8& p0, + Vec4vf8& p1, + vbool8& cL, + vbool8& cR, + const LineSegments* geom, + float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0,a1; + gatheri(a0,a1,geom,itime); + Vec4vf8 b0,b1; + gatheri(b0,b1,geom,itime+1); + p0 = lerp(a0,b0,vfloat8(ftime)); + p1 = lerp(a1,b1,vfloat8(ftime)); + cL = !vbool8(leftExists); + cR = !vbool8(rightExists); + } + +#endif + + template + typename LineMi::Type LineMi::type; + + typedef LineMi<4> Line4i; + typedef LineMi<8> Line8i; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h new file mode 100644 index 00000000000..a431796a88d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h @@ -0,0 +1,124 @@ +// Copyright 
2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "linei.h" +#include "line_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template + struct FlatLinearCurveMiIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom); + const vbool valid = line.template valid(); + FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom); + const vbool valid = line.template valid(); + return FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct FlatLinearCurveMiMBIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool valid = line.template valid(); + 
FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()); + const vbool valid = line.template valid(); + return FlatLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct FlatLinearCurveMiIntersectorK + { + typedef LineMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom); + const vbool valid = line.template valid(); + FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom); + const vbool valid = line.template valid(); + return FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + + template + struct FlatLinearCurveMiMBIntersectorK + { + typedef LineMi Primitive; + typedef 
CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool valid = line.template valid(); + FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1; line.gather(v0,v1,geom,ray.time()[k]); + const vbool valid = line.template valid(); + return FlatLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/object.h b/thirdparty/embree-aarch64/kernels/geometry/object.h new file mode 100644 index 00000000000..f26391de523 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/object.h @@ -0,0 +1,84 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + struct Object + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored primitives */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline 
size_t blocks(size_t N) { return N; } + + public: + + /*! constructs a virtual object */ + Object (unsigned geomID, unsigned primID) + : _geomID(geomID), _primID(primID) {} + + __forceinline unsigned geomID() const { + return _geomID; + } + + __forceinline unsigned primID() const { + return _primID; + } + + /*! fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene) + { + const PrimRef& prim = prims[i]; i++; + new (this) Object(prim.geomID(), prim.primID()); + } + + /*! fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime) + { + const PrimRef& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,itime); + } + + /*! fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range) + { + const PrimRefMB& prim = prims[i]; i++; + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + new (this) Object(geomID, primID); + AccelSet* accel = (AccelSet*) scene->get(geomID); + return accel->linearBounds(primID,time_range); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(AccelSet* mesh) { + return mesh->bounds(primID()); + } + + private: + unsigned int _geomID; //!< geometry ID + unsigned int _primID; //!< primitive ID + }; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h new file mode 100644 index 00000000000..97882e0e599 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h @@ -0,0 +1,127 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + 
+#include "object.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template + struct ObjectIntersector1 + { + typedef Object Primitive; + + static const bool validIntersectorK = false; + + struct Precalculations { + __forceinline Precalculations() {} + __forceinline Precalculations (const Ray& ray, const void *ptr) {} + }; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return; +#endif + + accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + if ((ray.mask & accel->mask) == 0) + return false; +#endif + + accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID()); + context->geomID = prim.geomID(); + context->primID = prim.primID(); + return accel->pointQuery(query, context); + } + + template + static __forceinline void intersectK(const vbool& valid, /* PrecalculationsK& pre, */ RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + } + + template + static __forceinline vbool occludedK(const vbool& valid, /* PrecalculationsK& pre, */ RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node) + { + assert(false); + return valid; + } + }; + + template + struct ObjectIntersectorK + { + 
typedef Object Primitive; + + struct Precalculations { + __forceinline Precalculations (const vbool& valid, const RayK& ray) {} + }; + + static __forceinline void intersect(const vbool& valid_i, const Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& prim) + { + vbool valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask & accel->mask) != 0; + if (none(valid)) return; +#endif + accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); + } + + static __forceinline vbool occluded(const vbool& valid_i, const Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& prim) + { + vbool valid = valid_i; + AccelSet* accel = (AccelSet*) context->scene->get(prim.geomID()); + + /* perform ray mask test */ +#if defined(EMBREE_RAY_MASK) + valid &= (ray.mask & accel->mask) != 0; + if (none(valid)) return false; +#endif + accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + return ray.tfar < 0.0f; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& prim) { + intersect(vbool(1<& ray, size_t k, IntersectContext* context, const Primitive& prim) { + occluded(vbool(1< ObjectIntersector4; + typedef ObjectIntersectorK<8,false> ObjectIntersector8; + typedef ObjectIntersectorK<16,false> ObjectIntersector16; + + typedef ObjectIntersectorK<4,true> ObjectIntersector4MB; + typedef ObjectIntersectorK<8,true> ObjectIntersector8MB; + typedef ObjectIntersectorK<16,true> ObjectIntersector16MB; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/plane.h b/thirdparty/embree-aarch64/kernels/geometry/plane.h new file mode 100644 index 00000000000..ebe45db5585 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/plane.h @@ -0,0 +1,57 @@ +// Copyright 2009-2020 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + struct HalfPlane + { + const Vec3fa P; //!< plane origin + const Vec3fa N; //!< plane normal + + __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N) + : P(P), N(N) {} + + __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3fa O = Vec3fa(ray_org) - P; + Vec3fa D = Vec3fa(ray_dir); + float ON = dot(O,N); + float DN = dot(D,N); + bool eps = abs(DN) < min_rcp_input; + float t = -ON*rcp(DN); + float lower = select(eps || DN < 0.0f, float(neg_inf), t); + float upper = select(eps || DN > 0.0f, float(pos_inf), t); + return BBox1f(lower,upper); + } + }; + + template + struct HalfPlaneN + { + const Vec3vf P; //!< plane origin + const Vec3vf N; //!< plane normal + + __forceinline HalfPlaneN(const Vec3vf& P, const Vec3vf& N) + : P(P), N(N) {} + + __forceinline BBox> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const + { + Vec3vf O = Vec3vf((Vec3fa)ray_org) - P; + Vec3vf D = Vec3vf((Vec3fa)ray_dir); + vfloat ON = dot(O,N); + vfloat DN = dot(D,N); + vbool eps = abs(DN) < min_rcp_input; + vfloat t = -ON*rcp(DN); + vfloat lower = select(eps | DN < 0.0f, vfloat(neg_inf), t); + vfloat upper = select(eps | DN > 0.0f, vfloat(pos_inf), t); + return BBox>(lower,upper); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/pointi.h b/thirdparty/embree-aarch64/kernels/geometry/pointi.h new file mode 100644 index 00000000000..4ba298e86b6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/pointi.h @@ -0,0 +1,417 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + template + struct PointMi + { + /* Virtual interface to query information about the line segment type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + 
size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored line segments */ + static __forceinline size_t max_size() + { + return M; + } + + /* Returns required number of primitive blocks for N line segments */ + static __forceinline size_t blocks(size_t N) + { + return (N + max_size() - 1) / max_size(); + } + + /* Returns required number of bytes for N line segments */ + static __forceinline size_t bytes(size_t N) + { + return blocks(N) * sizeof(PointMi); + } + + public: + /* Default constructor */ + __forceinline PointMi() {} + + /* Construction from vertices and IDs */ + __forceinline PointMi(const vuint& geomIDs, const vuint& primIDs, Geometry::GType gtype, uint32_t numPrimitives) + : gtype((unsigned char)gtype), + numPrimitives(numPrimitives), + sharedGeomID(geomIDs[0]), + primIDs(primIDs) + { + assert(all(vuint(geomID()) == geomIDs)); + } + + /* Returns a mask that tells which line segments are valid */ + __forceinline vbool valid() const { + return vint(step) < vint(numPrimitives); + } + + /* Returns a mask that tells which line segments are valid */ + template __forceinline vbool valid() const { + return vint(step) < vint(numPrimitives); + } + + /* Returns if the specified line segment is valid */ + __forceinline bool valid(const size_t i) const + { + assert(i < M); + return i < numPrimitives; + } + + /* Returns the number of stored line segments */ + __forceinline size_t size() const { + return numPrimitives; + } + + __forceinline unsigned int geomID(unsigned int i = 0) const { + return sharedGeomID; + } + + __forceinline vuint& primID() { + return primIDs; + } + __forceinline const vuint& primID() const { + return primIDs; + } + __forceinline unsigned int primID(const size_t i) const { + assert(i < M); + return primIDs[i]; + } + + /* gather the line segments 
*/ + __forceinline void gather(Vec4vf& p0, const Points* geom) const; + __forceinline void gather(Vec4vf& p0, Vec3vf& n0, const Points* geom) const; + + __forceinline void gatheri(Vec4vf& p0, const Points* geom, const int itime) const; + __forceinline void gatheri(Vec4vf& p0, Vec3vf& n0, const Points* geom, const int itime) const; + + __forceinline void gather(Vec4vf& p0, const Points* geom, float time) const; + __forceinline void gather(Vec4vf& p0, Vec3vf& n0, const Points* geom, float time) const; + + /* Calculate the bounds of the line segments */ + __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const + { + BBox3fa bounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get(geomID(i)); + bounds.extend(geom->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) { + return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1)); + } + + __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get(geomID(i)); + allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i = 0; i < M && valid(i); i++) { + const Points* geom = scene->get(geomID((unsigned int)i)); + allBounds.extend(geom->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill line segment from line segment list */ + template + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); + vuint geomID, primID; + vuint v0; + const PrimRefT* prim = &prims[begin]; + + int 
numPrimitives = 0; + for (size_t i = 0; i < M; i++) { + if (begin < end) { + geomID[i] = prim->geomID(); + primID[i] = prim->primID(); + begin++; + numPrimitives++; + } else { + assert(i); + if (i > 0) { + geomID[i] = geomID[i - 1]; + primID[i] = primID[i - 1]; + } + } + if (begin < end) + prim = &prims[begin]; // FIXME: remove this line + } + new (this) PointMi(geomID, primID, gty, numPrimitives); // FIXME: use non temporal store + } + + template + __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh, + const PrimRef* prims, + const range& set, + const Allocator& alloc) + { + size_t start = set.begin(); + size_t items = PointMi::blocks(set.size()); + size_t numbytes = PointMi::bytes(set.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + for (size_t i = 0; i < items; i++) { + accel[i].fill(prims, start, set.end(), bvh->scene); + } + return bvh->encodeLeaf((char*)accel, items); + }; + + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + fill(prims, begin, end, scene); + return linearBounds(scene, itime); + } + + __forceinline LBBox3fa fillMB( + const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + fill(prims, begin, end, scene); + return linearBounds(scene, time_range); + } + + template + __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc) + { + size_t start = prims.object_range.begin(); + size_t end = prims.object_range.end(); + size_t items = PointMi::blocks(prims.object_range.size()); + size_t numbytes = PointMi::bytes(prims.object_range.size()); + PointMi* accel = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float)); + const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items); + + LBBox3fa bounds = empty; + for (size_t i = 0; i < items; i++) + bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range)); + + 
return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range); + }; + + /*! output operator */ + friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& line) + { + return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}"; + } + + public: + unsigned char gtype; + unsigned char numPrimitives; + unsigned int sharedGeomID; + + private: + vuint primIDs; // primitive ID + }; + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, 
p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + transpose(a0, a1, a2, a3, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); + transpose(b0, b1, b2, b3, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0; gatheri(a0, geom, itime); + Vec4vf4 b0; gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + } + + template<> + __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf4 a0, b0; + Vec3vf4 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat4(ftime)); + n0 = lerp(norm0, norm1, vfloat4(ftime)); + } + +#if defined(__AVX__) + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = 
vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0))); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1))); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2))); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3))); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4))); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5))); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6))); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7))); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0))); + const vfloat4 b1 = vfloat4(geom->normal(primID(1))); + const vfloat4 b2 = vfloat4(geom->normal(primID(2))); + const vfloat4 b3 = vfloat4(geom->normal(primID(3))); + const vfloat4 b4 = vfloat4(geom->normal(primID(4))); + const vfloat4 b5 = vfloat4(geom->normal(primID(5))); + const vfloat4 b6 = vfloat4(geom->normal(primID(6))); + const vfloat4 b7 = vfloat4(geom->normal(primID(7))); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), 
itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + } + + template<> + __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const + { + const vfloat4 a0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime)); + const vfloat4 a1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime)); + const vfloat4 a2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime)); + const vfloat4 a3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime)); + const vfloat4 a4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime)); + const vfloat4 a5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime)); + const vfloat4 a6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime)); + const vfloat4 a7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime)); + transpose(a0, a1, a2, a3, a4, a5, a6, a7, p0.x, p0.y, p0.z, p0.w); + const vfloat4 b0 = vfloat4(geom->normal(primID(0), itime)); + const vfloat4 b1 = vfloat4(geom->normal(primID(1), itime)); + const vfloat4 b2 = vfloat4(geom->normal(primID(2), itime)); + const vfloat4 b3 = vfloat4(geom->normal(primID(3), itime)); + const vfloat4 b4 = vfloat4(geom->normal(primID(4), itime)); + const vfloat4 b5 = vfloat4(geom->normal(primID(5), itime)); + const vfloat4 b6 = vfloat4(geom->normal(primID(6), itime)); + const vfloat4 b7 = vfloat4(geom->normal(primID(7), itime)); + transpose(b0, b1, b2, b3, b4, b5, b6, b7, n0.x, n0.y, n0.z); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const + { + float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0; + gatheri(a0, geom, itime); + Vec4vf8 b0; + gatheri(b0, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + } + + template<> + __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const + { + 
float ftime; + const int itime = geom->timeSegment(time, ftime); + + Vec4vf8 a0, b0; + Vec3vf8 norm0, norm1; + gatheri(a0, norm0, geom, itime); + gatheri(b0, norm1, geom, itime + 1); + p0 = lerp(a0, b0, vfloat8(ftime)); + n0 = lerp(norm0, norm1, vfloat8(ftime)); + } +#endif + + template + typename PointMi::Type PointMi::type; + + typedef PointMi<4> Point4i; + typedef PointMi<8> Point8i; + +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive.h b/thirdparty/embree-aarch64/kernels/geometry/primitive.h new file mode 100644 index 00000000000..41e5b2b3042 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/primitive.h @@ -0,0 +1,49 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene.h" +#include "../../common/simd/simd.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + struct PrimitiveType + { + /*! returns name of this primitive type */ + virtual const char* name() const = 0; + + /*! Returns the number of stored active primitives in a block. */ + virtual size_t sizeActive(const char* This) const = 0; + + /*! Returns the number of stored active and inactive primitives in a block. */ + virtual size_t sizeTotal(const char* This) const = 0; + + /*! Returns the number of bytes of block. 
*/ + virtual size_t getBytes(const char* This) const = 0; + }; + + template + struct PrimitivePointQuery1 + { + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim) + { + bool changed = false; + for (size_t i = 0; i < Primitive::max_size(); i++) + { + if (!prim.valid(i)) break; + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i)); + context->geomID = prim.geomID(i); + context->primID = prim.primID(i); + changed |= accel->pointQuery(query, context); + } + return changed; + } + + static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp new file mode 100644 index 00000000000..f93574c9c8d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp @@ -0,0 +1,379 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" +#include "curveNv.h" +#include "curveNi.h" +#include "curveNi_mb.h" +#include "linei.h" +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "trianglei.h" +#include "quadv.h" +#include "quadi.h" +#include "subdivpatch1.h" +#include "object.h" +#include "instance.h" +#include "subgrid.h" + +namespace embree +{ + /********************** Curve4v **************************/ + + template<> + const char* Curve4v::Type::name () const { + return "curve4v"; + } + + template<> + size_t Curve4v::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 
4; + else + return ((Curve4v*)This)->N; + } + + template<> + size_t Curve4v::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4v::bytes(sizeActive(This)); + } + + /********************** Curve4i **************************/ + + template<> + const char* Curve4i::Type::name () const { + return "curve4i"; + } + + template<> + size_t Curve4i::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4i*)This)->N; + } + + template<> + size_t Curve4i::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + return Curve4i::bytes(sizeActive(This)); + } + + /********************** Curve4iMB **************************/ + + template<> + const char* Curve4iMB::Type::name () const { + return "curve4imb"; + } + + template<> + size_t Curve4iMB::Type::sizeActive(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return ((Line4i*)This)->size(); + else + return ((Curve4iMB*)This)->N; + } + + template<> + size_t Curve4iMB::Type::sizeTotal(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return 4; + else + return ((Curve4iMB*)This)->N; + } + + template<> + size_t Curve4iMB::Type::getBytes(const char* This) const + { + if ((*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR) + return Line4i::bytes(sizeActive(This)); + else + 
return Curve4iMB::bytes(sizeActive(This)); + } + + /********************** Line4i **************************/ + + template<> + const char* Line4i::Type::name () const { + return "line4i"; + } + + template<> + size_t Line4i::Type::sizeActive(const char* This) const { + return ((Line4i*)This)->size(); + } + + template<> + size_t Line4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Line4i::Type::getBytes(const char* This) const { + return sizeof(Line4i); + } + + /********************** Triangle4 **************************/ + + template<> + const char* Triangle4::Type::name () const { + return "triangle4"; + } + + template<> + size_t Triangle4::Type::sizeActive(const char* This) const { + return ((Triangle4*)This)->size(); + } + + template<> + size_t Triangle4::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4::Type::getBytes(const char* This) const { + return sizeof(Triangle4); + } + + /********************** Triangle4v **************************/ + + template<> + const char* Triangle4v::Type::name () const { + return "triangle4v"; + } + + template<> + size_t Triangle4v::Type::sizeActive(const char* This) const { + return ((Triangle4v*)This)->size(); + } + + template<> + size_t Triangle4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4v::Type::getBytes(const char* This) const { + return sizeof(Triangle4v); + } + + /********************** Triangle4i **************************/ + + template<> + const char* Triangle4i::Type::name () const { + return "triangle4i"; + } + + template<> + size_t Triangle4i::Type::sizeActive(const char* This) const { + return ((Triangle4i*)This)->size(); + } + + template<> + size_t Triangle4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4i::Type::getBytes(const char* This) const { + return sizeof(Triangle4i); + } + + /********************** Triangle4vMB 
**************************/ + + template<> + const char* Triangle4vMB::Type::name () const { + return "triangle4vmb"; + } + + template<> + size_t Triangle4vMB::Type::sizeActive(const char* This) const { + return ((Triangle4vMB*)This)->size(); + } + + template<> + size_t Triangle4vMB::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Triangle4vMB::Type::getBytes(const char* This) const { + return sizeof(Triangle4vMB); + } + + /********************** Quad4v **************************/ + + template<> + const char* Quad4v::Type::name () const { + return "quad4v"; + } + + template<> + size_t Quad4v::Type::sizeActive(const char* This) const { + return ((Quad4v*)This)->size(); + } + + template<> + size_t Quad4v::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4v::Type::getBytes(const char* This) const { + return sizeof(Quad4v); + } + + /********************** Quad4i **************************/ + + template<> + const char* Quad4i::Type::name () const { + return "quad4i"; + } + + template<> + size_t Quad4i::Type::sizeActive(const char* This) const { + return ((Quad4i*)This)->size(); + } + + template<> + size_t Quad4i::Type::sizeTotal(const char* This) const { + return 4; + } + + template<> + size_t Quad4i::Type::getBytes(const char* This) const { + return sizeof(Quad4i); + } + + /********************** SubdivPatch1 **************************/ + + const char* SubdivPatch1::Type::name () const { + return "subdivpatch1"; + } + + size_t SubdivPatch1::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubdivPatch1::Type::getBytes(const char* This) const { + return sizeof(SubdivPatch1); + } + + SubdivPatch1::Type SubdivPatch1::type; + + /********************** Virtual Object **************************/ + + const char* Object::Type::name () const { + return "object"; + } + + size_t Object::Type::sizeActive(const 
char* This) const { + return 1; + } + + size_t Object::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t Object::Type::getBytes(const char* This) const { + return sizeof(Object); + } + + Object::Type Object::type; + + /********************** Instance **************************/ + + const char* InstancePrimitive::Type::name () const { + return "instance"; + } + + size_t InstancePrimitive::Type::sizeActive(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t InstancePrimitive::Type::getBytes(const char* This) const { + return sizeof(InstancePrimitive); + } + + InstancePrimitive::Type InstancePrimitive::type; + + /********************** SubGrid **************************/ + + const char* SubGrid::Type::name () const { + return "subgrid"; + } + + size_t SubGrid::Type::sizeActive(const char* This) const { + return 1; + } + + size_t SubGrid::Type::sizeTotal(const char* This) const { + return 1; + } + + size_t SubGrid::Type::getBytes(const char* This) const { + return sizeof(SubGrid); + } + + SubGrid::Type SubGrid::type; + + /********************** SubGridQBVH4 **************************/ + + template<> + const char* SubGridQBVH4::Type::name () const { + return "SubGridQBVH4"; + } + + template<> + size_t SubGridQBVH4::Type::sizeActive(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::sizeTotal(const char* This) const { + return 1; + } + + template<> + size_t SubGridQBVH4::Type::getBytes(const char* This) const { + return sizeof(SubGridQBVH4); + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h new file mode 100644 index 00000000000..57ff4e60e5f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h @@ -0,0 +1,76 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + 
+namespace embree +{ + namespace isa + { + /*! Intersects a ray with a quad with backface culling + * enabled. The quad v0,v1,v2,v3 is split into two triangles + * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two + * triangles gets intersected. */ + template + __forceinline vbool intersect_quad_backface_culling(const vbool& valid0, + const Vec3fa& ray_org, + const Vec3fa& ray_dir, + const float ray_tnear, + const float ray_tfar, + const Vec3vf& quad_v0, + const Vec3vf& quad_v1, + const Vec3vf& quad_v2, + const Vec3vf& quad_v3, + vfloat& u_o, + vfloat& v_o, + vfloat& t_o) + { + /* calculate vertices relative to ray origin */ + vbool valid = valid0; + const Vec3vf O = Vec3vf(ray_org); + const Vec3vf D = Vec3vf(ray_dir); + const Vec3vf va = quad_v0-O; + const Vec3vf vb = quad_v1-O; + const Vec3vf vc = quad_v2-O; + const Vec3vf vd = quad_v3-O; + + const Vec3vf edb = vb-vd; + const vfloat WW = dot(cross(vd,edb),D); + const Vec3vf v0 = select(WW <= 0.0f,va,vc); + const Vec3vf v1 = select(WW <= 0.0f,vb,vd); + const Vec3vf v2 = select(WW <= 0.0f,vd,vb); + + /* calculate edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + + /* perform edge tests */ + const vfloat U = dot(cross(v0,e0),D); + const vfloat V = dot(cross(v1,e1),D); + valid &= max(U,V) <= 0.0f; + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = cross(e1,e0); + const vfloat den = dot(Ng,D); + const vfloat rcpDen = rcp(den); + + /* perform depth test */ + const vfloat t = rcpDen*dot(v0,Ng); + valid &= vfloat(ray_tnear) <= t & t <= vfloat(ray_tfar); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + t_o = t; + u_o = U * rcpDen; + v_o = V * rcpDen; + u_o = select(WW <= 0.0f,u_o,1.0f-u_o); + v_o = select(WW <= 0.0f,v_o,1.0f-v_o); + return valid; + } + } +} diff --git 
a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h new file mode 100644 index 00000000000..74e8c7720c4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h @@ -0,0 +1,566 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + template + struct QuadHitM + { + __forceinline QuadHitM() {} + + __forceinline QuadHitM(const vbool& valid, + const vfloat& U, + const vfloat& V, + const vfloat& T, + const vfloat& absDen, + const Vec3vf& Ng, + const vbool& flags) + : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {} + + __forceinline void finalize() + { + const vfloat rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + const vfloat u = min(U * rcpAbsDen,1.0f); + const vfloat v = min(V * rcpAbsDen,1.0f); + const vfloat u1 = vfloat(1.0f) - u; + const vfloat v1 = vfloat(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat U; + vfloat V; + vfloat T; + vfloat absDen; + Vec3vf tri_Ng; + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + + public: + const vbool flags; + }; + + template + struct QuadHitK + { + __forceinline QuadHitK(const vfloat& U, + const 
vfloat& V, + const vfloat& T, + const vfloat& absDen, + const Vec3vf& Ng, + const vbool& flags) + : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vfloat rcpAbsDen = rcp(absDen); + const vfloat t = T * rcpAbsDen; + const vfloat u0 = min(U * rcpAbsDen,1.0f); + const vfloat v0 = min(V * rcpAbsDen,1.0f); + const vfloat u1 = vfloat(1.0f) - u0; + const vfloat v1 = vfloat(1.0f) - v0; + const vfloat u = select(flags,u1,u0); + const vfloat v = select(flags,v1,v0); + const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat T; + const vfloat absDen; + const vbool flags; + const Vec3vf tri_Ng; + }; + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + + template + struct QuadMIntersector1MoellerTrumbore; + + /*! Intersects M quads with 1 ray */ + template + struct QuadMIntersector1MoellerTrumbore + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + MoellerTrumboreHitM hit; + MoellerTrumboreIntersector1 intersector(ray,nullptr); + Intersect1EpilogM epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,hit)) + epilog(hit.valid,hit); + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const 
vuint& primID) const + { + MoellerTrumboreHitM hit; + MoellerTrumboreIntersector1 intersector(ray,nullptr); + Occluded1EpilogM epilog(ray,context,geomID,primID); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,hit)) + { + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined(__AVX512ER__) // KNL + + /*! Intersects 4 quads with 1 ray using AVX512 */ + template + struct QuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), + select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), + select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); + const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); +#else + const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), + select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), + select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); + const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), + select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), + select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); +#endif + const vbool16 flags(0xf0f0); + + MoellerTrumboreHitM<16> hit; + MoellerTrumboreIntersector1<16> intersector(ray,nullptr); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) + { + vfloat16 U = hit.U, V = hit.V, absDen = hit.absDen; +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = 
select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat16(-1.0f),vfloat16(1.0f)); // FIXME: use XOR +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + if (likely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#elif defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template + struct QuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline QuadMIntersector1MoellerTrumbore() {} + + __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + MoellerTrumboreHitM<8> hit; + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) + { + vfloat8 U = hit.U, V 
= hit.V, absDen = hit.absDen; + +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + + struct MoellerTrumboreIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template + static __forceinline bool intersect(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + const vbool& flags, + const Epilog& epilog) + { + /* calculate denominator */ + const Vec3vf O = broadcast>(ray.org,k); + const Vec3vf D = broadcast>(ray.dir,k); + const Vec3vf C = Vec3vf(tri_v0) - O; + const Vec3vf R = cross(C,D); + const vfloat den = dot(Vec3vf(tri_Ng),D); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; + const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + QuadHitM hit(valid,U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + template + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const vbool& flags, + const Epilog& epilog) + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + const Vec3vf Ng = cross(e2,e1); + return intersect(ray,k,v0,e1,e2,Ng,flags,epilog); + } + }; + + template + struct QuadMIntersectorKMoellerTrumboreBase + { + __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool& valid, const RayK& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + const vbool& flags, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool valid = valid0; + const Vec3vf C = tri_v0 - ray.org; + const Vec3vf R = cross(C,ray.dir); + const vfloat den = dot(tri_Ng,ray.dir); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat U = dot(R,tri_e2) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat V = dot(R,tri_e1) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + QuadHitK hit(U,V,T,absDen,tri_Ng,flags); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M quads. */ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + const Epilog& epilog) const + { + const Vec3vf e1 = tri_v0-tri_v1; + const Vec3vf e2 = tri_v2-tri_v0; + const Vec3vf Ng = cross(e2,e1); + return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog); + } + + /*! Intersects K rays with one of M quads. 
*/ + template + __forceinline bool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Vec3vf& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool(true ),epilog); + return none(valid0); + } + }; + + template + struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) + : QuadMIntersectorKMoellerTrumboreBase(valid,ray) {} + + __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Intersect1KEpilogM epilog(ray,k,context,geomID,primID); + MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog); + MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Occluded1KEpilogM epilog(ray,k,context,geomID,primID); + if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog)) return true; + if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog)) return true; + return false; + } + }; + + +#if defined(__AVX512ER__) // KNL + + /*! 
Intersects 4 quads with 1 ray using AVX512 */ + template + struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) + : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template + __forceinline bool intersect1(RayK& ray, size_t k, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), + select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), + select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); + const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); +#else + const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), + select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), + select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); + const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), + select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), + select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); +#endif + const vbool16 flags(0xf0f0); + return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + 
}; + +#elif defined(__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template + struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline QuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) + : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template + __forceinline bool intersect1(RayK& ray, size_t k, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h new file mode 100644 index 
00000000000..7ca3aed0a04 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h @@ -0,0 +1,529 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quad_intersector_moeller.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. */ + +namespace embree +{ + namespace isa + { + template + struct QuadHitPlueckerM + { + __forceinline QuadHitPlueckerM() {} + + __forceinline QuadHitPlueckerM(const vbool& valid, + const vfloat& U, + const vfloat& V, + const vfloat& UVW, + const vfloat& t, + const Vec3vf& Ng, + const vbool& flags) + : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {} + + __forceinline void finalize() + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + const vfloat u = min(U * rcpUVW,1.0f); + const vfloat v = min(V * rcpUVW,1.0f); + const vfloat u1 = vfloat(1.0f) - u; + const vfloat v1 = vfloat(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf(tri_Ng.x,tri_Ng.y,tri_Ng.z); +#else + const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z); +#endif + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + vfloat U; + vfloat 
V; + vfloat UVW; + Vec3vf tri_Ng; + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + + public: + const vbool flags; + }; + + template + struct QuadHitPlueckerK + { + __forceinline QuadHitPlueckerK(const vfloat& U, + const vfloat& V, + const vfloat& UVW, + const vfloat& t, + const Vec3vf& Ng, + const vbool& flags) + : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + const vfloat u0 = min(U * rcpUVW,1.0f); + const vfloat v0 = min(V * rcpUVW,1.0f); + const vfloat u1 = vfloat(1.0f) - u0; + const vfloat v1 = vfloat(1.0f) - v0; + const vfloat u = select(flags,u1,u0); + const vfloat v = select(flags,v1,v0); + const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat UVW; + const vfloat t; + const vbool flags; + const Vec3vf tri_Ng; + }; + + struct PlueckerIntersectorTriangle1 + { + template + static __forceinline bool intersect(Ray& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = Vec3vf((Vec3fa)ray.org); + const Vec3vf D = Vec3vf((Vec3fa)ray.dir); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); 
+#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + /*! Intersects M quads with 1 ray */ + template + struct QuadMIntersector1Pluecker + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Intersect1EpilogM epilog(ray,context,geomID,primID); + PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool(false),epilog); + PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool(true),epilog); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Occluded1EpilogM epilog(ray,context,geomID,primID); + if (PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool(false),epilog)) return true; + if (PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX512ER__) // KNL + + /*! 
Intersects 4 quads with 1 ray using AVX512 */ + template + struct QuadMIntersector1Pluecker<4,filter> + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), + select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), + select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); + const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); +#else + const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), + select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), + select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); + const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), + select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), + select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); +#endif + const vbool16 flags(0xf0f0); + return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#elif defined(__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template + struct QuadMIntersector1Pluecker<4,filter> + { + __forceinline QuadMIntersector1Pluecker() {} + + __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + struct PlueckerIntersector1KTriangleM + { + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + const Epilog& epilog) + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = broadcast>(ray.org,k); + const Vec3vf D = broadcast>(ray.dir,k); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + QuadHitPlueckerM hit(valid,U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + }; + + template + struct QuadMIntersectorKPlueckerBase + { + __forceinline QuadMIntersectorKPlueckerBase(const vbool& valid, const RayK& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + vbool valid = valid0; + const Vec3vf O = ray.org; + const Vec3vf D = ray.dir; + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); + const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); + const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Vec3vf(Ng),D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Vec3vf(Ng))); + const vfloat t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* calculate hit information */ + QuadHitPlueckerK hit(U,V,UVW,t,Ng,flags); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M quads. 
*/ + template + __forceinline bool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Vec3vf& v3, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool(false),epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool(true ),epilog); + return none(valid0); + } + }; + + template + struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase + { + __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) + : QuadMIntersectorKPlueckerBase(valid,ray) {} + + __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Intersect1KEpilogM epilog(ray,k,context,geomID,primID); + PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog); + PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const vuint& geomID, const vuint& primID) const + { + Occluded1KEpilogM epilog(ray,k,context,geomID,primID); + if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool(false),epilog)) return true; + if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool(true ),epilog)) return true; + return false; + } + }; + +#if defined(__AVX512ER__) // KNL + + /*! 
Intersects 4 quads with 1 ray using AVX512 */ + template + struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) + : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template + __forceinline bool intersect1(RayK& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), + select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), + select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z))); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); + const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); +#else + const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), + select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)), + select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z))); + const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)), + select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)), + select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z))); +#endif + + const vbool16 flags(0xf0f0); + return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#elif defined(__AVX__) + + 
/*! Intersects 4 quads with 1 ray using AVX */ + template + struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter> + { + __forceinline QuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) + : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {} + + template + __forceinline bool intersect1(RayK& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); + const vbool8 flags(0,0,0,0,1,1,1,1); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); + } + + __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const vuint4& geomID, const vuint4& primID) const + { + return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID))); + } + }; + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi.h b/thirdparty/embree-aarch64/kernels/geometry/quadi.h new file mode 100644 index 00000000000..741ec519ab4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quadi.h @@ -0,0 +1,483 @@ +// Copyright 
2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + template + struct QuadMi + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMi() { } + + /* Construction from vertices and IDs */ + __forceinline QuadMi(const vuint& v0, + const vuint& v1, + const vuint& v2, + const vuint& v3, + const vuint& geomIDs, + const vuint& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool valid() const { return primIDs != vuint(-1); } + + /* Returns if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } + __forceinline const vuint& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } + __forceinline const vuint& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(iget(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* 
Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Fill quad from quad list */ + template + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + vuint v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; igeomID(); + primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get(prim->geomID()); + const QuadMesh::Quad& q = mesh->quad(prim->primID()); + unsigned int_stride = mesh->vertices0.getStride()/4; + v0[i] = q.v[0] * int_stride; + v1[i] = q.v[1] * int_stride; + v2[i] = q.v[2] * int_stride; + v3[i] = q.v[3] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; // always valid geomIDs + primID[i] = -1; // indicates invalid data + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + v3[i] = v0[0]; + } + } + if (begin( " +#if !defined(EMBREE_COMPACT_POLYS) + << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", " +#endif + << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )"; + } + + protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint v0_; // 4 byte offset of 1st vertex + vuint v1_; // 4 byte offset of 2nd vertex + 
vuint v2_; // 4 byte offset of 3rd vertex + vuint v3_; // 4 byte offset of 4th vertex +#endif + vuint geomIDs; // geometry ID of mesh + vuint primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template + struct QuadMi : public embree::QuadMi + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::QuadMi::v0_; + using embree::QuadMi::v1_; + using embree::QuadMi::v2_; + using embree::QuadMi::v3_; +#endif + using embree::QuadMi::geomIDs; + using embree::QuadMi::primIDs; + using embree::QuadMi::geomID; + using embree::QuadMi::primID; + using embree::QuadMi::valid; + + template + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get(geomID(index)); + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + return (Vec3f) mesh->vertices[0][quad.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template + __forceinline Vec3 getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh* mesh = scene->get(geomID(index)); + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const QuadMesh* mesh = scene->get(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3 p0(v0.x,v0.y,v0.z); + const Vec3 p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template + __forceinline Vec3 getVertex(const vbool& valid, const size_t index, const Scene *const 
scene, const vint& itime, const T& ftime) const + { + Vec3 p0, p1; + const QuadMesh* mesh = scene->get(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const QuadMesh::Quad& quad = mesh->quad(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Quad { + vfloat4 v0,v1,v2,v3; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]]; + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero, zero }; + const QuadMesh* mesh = scene->get(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]]; + const vfloat4 v1 = (vfloat4) 
mesh->vertices[itime][quad.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]]; + const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]]; + return { v0, v1, v2, v3 }; + } + +#else + + __forceinline Quad loadQuad(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + + __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const + { + const unsigned int geomID = geomIDs[i]; + const QuadMesh* mesh = scene->get(geomID); + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]); + return { v0, v1, v2, v3 }; + } + +#endif + + /* Gather the quads */ + __forceinline void gather(Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + Vec3vf& p3, + const Scene *const scene) const; + +#if defined(__AVX512F__) + __forceinline void gather(Vec3vf16& p0, + Vec3vf16& p1, + Vec3vf16& p2, + Vec3vf16& p3, + const Scene *const scene) const; +#endif + + template +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const vbool& valid, + Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + Vec3vf& p3, + const size_t index, + const Scene* const scene, + const vfloat& time) const + { + const QuadMesh* mesh = scene->get(geomID(index)); + + vfloat ftime; + const vint itime = mesh->timeSegment(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = 
getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + p3 = getVertex<3>(index, scene, itime[first], ftime); + } + else + { + p0 = getVertex<0>(valid, index, scene, itime, ftime); + p1 = getVertex<1>(valid, index, scene, itime, ftime); + p2 = getVertex<2>(valid, index, scene, itime, ftime); + p3 = getVertex<3>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + Vec3vf& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + Vec3vf& p3, + const Scene *const scene, + const float time) const; + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + for (size_t i=0; iquad(primId); + const Vec3fa p0 = mesh->vertex(q.v[0]); + const Vec3fa p1 = mesh->vertex(q.v[1]); + const Vec3fa p2 = mesh->vertex(q.v[2]); + const Vec3fa p3 = mesh->vertex(q.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + } + return bounds; + } + + private: +#if !defined(EMBREE_COMPACT_POLYS) + template const vuint& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; } + template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; } +#endif + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + prefetchL1(((char*)this)+0*64); + prefetchL1(((char*)this)+1*64); + const Quad tri0 
= loadQuad(0,scene); + const Quad tri1 = loadQuad(1,scene); + const Quad tri2 = loadQuad(2,scene); + const Quad tri3 = loadQuad(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const QuadMesh* mesh, + const Scene *const scene, + const int itime) const + { + // FIXME: for trianglei there all geometries are identical, is this the case here too? + + const Quad tri0 = loadQuad(0,itime,scene); + const Quad tri1 = loadQuad(1,itime,scene); + const Quad tri2 = loadQuad(2,itime,scene); + const Quad tri3 = loadQuad(3,itime,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + transpose(tri0.v3,tri1.v3,tri2.v3,tri3.v3,p3.x,p3.y,p3.z); + } + + template<> + __forceinline void QuadMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const float time) const + { + const QuadMesh* mesh = scene->get(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2,a3; gather(a0,a1,a2,a3,mesh,scene,itime); + Vec3vf4 b0,b1,b2,b3; gather(b0,b1,b2,b3,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + p3 = lerp(a3,b3,vfloat4(ftime)); + } + } + + template + typename QuadMi::Type QuadMi::type; + + typedef QuadMi<4> Quad4i; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h new file mode 100644 index 00000000000..96cf7f1ca2d --- /dev/null +++ 
b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h @@ -0,0 +1,350 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadi.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template + struct QuadMiIntersector1Moeller + { + typedef QuadMi Primitive; + typedef QuadMIntersector1MoellerTrumbore Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template + struct QuadMiIntersectorKMoeller + { + typedef QuadMi Primitive; + typedef QuadMIntersectorKMoellerTrumbore Precalculations; + + /*! Intersects K rays with M triangles. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf p0 = quad.template getVertex<0>(i,scene); + const Vec3vf p1 = quad.template getVertex<1>(i,scene); + const Vec3vf p2 = quad.template getVertex<2>(i,scene); + const Vec3vf p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) + { + Scene* scene = context->scene; + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf p0 = quad.template getVertex<0>(i,scene); + const Vec3vf p1 = quad.template getVertex<1>(i,scene); + const Vec3vf p2 = quad.template getVertex<2>(i,scene); + const Vec3vf p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template + struct QuadMiIntersector1Pluecker + { + typedef QuadMi Primitive; + typedef QuadMIntersector1Pluecker Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template + struct QuadMiIntersectorKPluecker + { + typedef QuadMi Primitive; + typedef QuadMIntersectorKPluecker Precalculations; + + /*! Intersects K rays with M triangles. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) + { + Scene* scene = context->scene; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf p0 = quad.template getVertex<0>(i,scene); + const Vec3vf p1 = quad.template getVertex<1>(i,scene); + const Vec3vf p2 = quad.template getVertex<2>(i,scene); + const Vec3vf p3 = quad.template getVertex<3>(i,scene); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) + { + Scene* scene = context->scene; + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf p0 = quad.template getVertex<0>(i,scene); + const Vec3vf p1 = quad.template getVertex<1>(i,scene); + const Vec3vf p2 = quad.template getVertex<2>(i,scene); + const Vec3vf p3 = quad.template getVertex<3>(i,scene); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf4 v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M motion blur quads with 1 ray */ + template + struct QuadMiMBIntersector1Moeller + { + typedef QuadMi Primitive; + typedef QuadMIntersector1MoellerTrumbore Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template + struct QuadMiMBIntersectorKMoeller + { + typedef QuadMi Primitive; + typedef QuadMIntersectorKMoellerTrumbore Precalculations; + + /*! Intersects K rays with M quads. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) + { + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) + { + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + + /*! 
Intersects M motion blur quads with 1 ray */ + template + struct QuadMiMBIntersector1Pluecker + { + typedef QuadMi Primitive; + typedef QuadMIntersector1Pluecker Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()); + return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M motion blur quads with K rays. */ + template + struct QuadMiMBIntersectorKPluecker + { + typedef QuadMi Primitive; + typedef QuadMIntersectorKPluecker Precalculations; + + /*! Intersects K rays with M quads. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMi& quad) + { + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M quads. 
*/ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMi& quad) + { + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time()); + if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M quads and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M quads. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMi& quad) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID()); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv.h b/thirdparty/embree-aarch64/kernels/geometry/quadv.h new file mode 100644 index 00000000000..0a1fe4d128a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quadv.h @@ -0,0 +1,165 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M quads in struct of array layout */ + template + struct QuadMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const 
char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline QuadMv() {} + + /* Construction from vertices and IDs */ + __forceinline QuadMv(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const vuint& geomIDs, const vuint& primIDs) + : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which quads are valid */ + __forceinline vbool valid() const { return geomIDs != vuint(-1); } + + /* Returns true if the specified quad is valid */ + __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } + __forceinline const vuint& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i primID() { return primIDs; } + __forceinline const vuint primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2,v3); + Vec3vf upper = max(v0,v1,v2,v3); + vbool mask = valid(); + lower.x = select(mask,lower.x,vfloat(pos_inf)); + lower.y = select(mask,lower.y,vfloat(pos_inf)); + lower.z = select(mask,lower.z,vfloat(pos_inf)); + upper.x = select(mask,upper.x,vfloat(neg_inf)); + upper.y = select(mask,upper.y,vfloat(neg_inf)); + upper.z = select(mask,upper.z,vfloat(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(QuadMv* dst, const QuadMv& src) + { + vfloat::store_nt(&dst->v0.x,src.v0.x); + vfloat::store_nt(&dst->v0.y,src.v0.y); + vfloat::store_nt(&dst->v0.z,src.v0.z); + 
vfloat::store_nt(&dst->v1.x,src.v1.x); + vfloat::store_nt(&dst->v1.y,src.v1.y); + vfloat::store_nt(&dst->v1.z,src.v1.z); + vfloat::store_nt(&dst->v2.x,src.v2.x); + vfloat::store_nt(&dst->v2.y,src.v2.y); + vfloat::store_nt(&dst->v2.z,src.v2.z); + vfloat::store_nt(&dst->v3.x,src.v3.x); + vfloat::store_nt(&dst->v3.y,src.v3.y); + vfloat::store_nt(&dst->v3.z,src.v3.z); + vuint::store_nt(&dst->geomIDs,src.geomIDs); + vuint::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill quad from quad list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero, v3 = zero; + + for (size_t i=0; iget(geomID); + const QuadMesh::Quad& quad = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(quad.v[0]); + const Vec3fa& p1 = mesh->vertex(quad.v[1]); + const Vec3fa& p2 = mesh->vertex(quad.v[2]); + const Vec3fa& p3 = mesh->vertex(quad.v[3]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(QuadMesh* mesh) + { + BBox3fa bounds = empty; + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; iquad(primId); + const Vec3fa p0 = mesh->vertex(quad.v[0]); + const Vec3fa p1 = mesh->vertex(quad.v[1]); + const Vec3fa p2 = mesh->vertex(quad.v[2]); + const Vec3fa p3 = mesh->vertex(quad.v[3]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + 
new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf v0; // 1st vertex of the quads + Vec3vf v1; // 2nd vertex of the quads + Vec3vf v2; // 3rd vertex of the quads + Vec3vf v3; // 4rd vertex of the quads + private: + vuint geomIDs; // geometry ID + vuint primIDs; // primitive ID + }; + + template + typename QuadMv::Type QuadMv::type; + + typedef QuadMv<4> Quad4v; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h new file mode 100644 index 00000000000..30a24b291a7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h @@ -0,0 +1,181 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "quadv.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M quads with 1 ray */ + template + struct QuadMvIntersector1Moeller + { + typedef QuadMv Primitive; + typedef QuadMIntersector1MoellerTrumbore Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. 
*/ + template + struct QuadMvIntersectorKMoeller + { + typedef QuadMv Primitive; + typedef QuadMIntersectorKMoellerTrumbore Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMv& quad) + { + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf p0 = broadcast>(quad.v0,i); + const Vec3vf p1 = broadcast>(quad.v1,i); + const Vec3vf p2 = broadcast>(quad.v2,i); + const Vec3vf p3 = broadcast>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMv& quad) + { + vbool valid0 = valid_i; + + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf p0 = broadcast>(quad.v0,i); + const Vec3vf p1 = broadcast>(quad.v1,i); + const Vec3vf p2 = broadcast>(quad.v2,i); + const Vec3vf p3 = broadcast>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMv& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMv& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + }; + + /*! Intersects M quads with 1 ray */ + template + struct QuadMvIntersector1Pluecker + { + typedef QuadMv Primitive; + typedef QuadMIntersector1Pluecker Precalculations; + + /*! Intersect a ray with the M quads and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of M quads. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad) + { + return PrimitivePointQuery1::pointQuery(query, context, quad); + } + }; + + /*! Intersects M triangles with K rays. */ + template + struct QuadMvIntersectorKPluecker + { + typedef QuadMv Primitive; + typedef QuadMIntersectorKPluecker Precalculations; + + /*! Intersects K rays with M triangles. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const QuadMv& quad) + { + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf p0 = broadcast>(quad.v0,i); + const Vec3vf p1 = broadcast>(quad.v1,i); + const Vec3vf p2 = broadcast>(quad.v2,i); + const Vec3vf p3 = broadcast>(quad.v3,i); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM(ray,context,quad.geomID(),quad.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const QuadMv& quad) + { + vbool valid0 = valid_i; + + for (size_t i=0; i::max_size(); i++) + { + if (!quad.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf p0 = broadcast>(quad.v0,i); + const Vec3vf p1 = broadcast>(quad.v1,i); + const Vec3vf p2 = broadcast>(quad.v2,i); + const Vec3vf p3 = broadcast>(quad.v3,i); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM(valid0,ray,context,quad.geomID(),quad.primID(),i))) + break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const QuadMv& quad) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const QuadMv& quad) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID()); + } + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h new file mode 100644 index 00000000000..cdf68f486bb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h @@ -0,0 +1,710 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "curve_intersector_precalculations.h" + + +/* + + This file implements the intersection of a ray with a round linear + curve segment. We define the geometry of such a round linear curve + segment from point p0 with radius r0 to point p1 with radius r1 + using the cone that touches spheres p0/r0 and p1/r1 tangentially + plus the sphere p1/r1. We denote the tangentially touching cone from + p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending + sphere with cone_sphere(p0,r0,p1,r1). + + For multiple connected round linear curve segments this construction + yield a proper shape when viewed from the outside. Using the + following CSG we can also handle the interiour in most common cases: + + round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = + cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) + + Thus by subtracting the neighboring cone geometries, we cut away + parts of the center cone_sphere surface which lie inside the + combined curve. This approach works as long as geometry of the + current cone_sphere penetrates into direct neighbor segments only, + and not into segments further away. 
+ + To construct a cone that touches two spheres at p0 and p1 with r0 + and r1, one has to increase the cone radius at r0 and r1 to obtain + larger radii w0 and w1, such that the infinite cone properly touches + the spheres. From the paper "Ray Tracing Generalized Tube + Primitives: Method and Applications" + (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications) + one can derive the following equations for these increased + radii: + + sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0)) + w0 = sr*r0 + w1 = sr*r1 + + Further, we want the cone to start where it touches the sphere at p0 + and to end where it touches sphere at p1. Therefore, we need to + construct clipping locations y0 and y1 for the start and end of the + cone. These start and end clipping location of the cone can get + calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Where the cone starts a distance Y0 and ends a distance Y1 away of + point p0 along the cone center. The distance between Y1-Y0 can get + calculated as: + + dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0) + + In the code below, Y will always be scaled by length(p1-p0) to + obtain y and you will find the terms r0*(r1-r0) and + (p1-p0)^2-(r1-r0)^2. 
+ + */ + +namespace embree +{ + namespace isa + { + template + struct RoundLineIntersectorHitM + { + __forceinline RoundLineIntersectorHitM() {} + + __forceinline RoundLineIntersectorHitM(const vfloat& u, const vfloat& v, const vfloat& t, const Vec3vf& Ng) + : vu(u), vv(v), vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + namespace __roundline_internal + { + template + struct ConeGeometry + { + ConeGeometry (const Vec4vf& a, const Vec4vf& b) + : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {} + + /* + + This function tests if a point is accepted by first cone + clipping plane. + + First, we need to project the point onto the line p0->p1: + + Y = (p-p0)*(p1-p0)/length(p1-p0) + + This value y is the distance to the projection point from + p0. The clip distances are calculated as: + + Y0 = - r0 * (r1-r0) / length(p1-p0) + Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0) + + Thus to test if the point p is accepted by the first + clipping plane we need to test Y > Y0 and to test if it + is accepted by the second clipping plane we need to test + Y < Y1. + + By multiplying the calculations with length(p1-p0) these + calculation can get simplied to: + + y = (p-p0)*(p1-p0) + y0 = - r0 * (r1-r0) + y1 = (p1-p0)^2 - r1 * (r1-r0) + + and the test y > y0 and y < y1. 
+ + */ + + __forceinline vbool isClippedByPlane (const vbool& valid_i, const Vec3vf& p) const + { + const Vec3vf p0p = p - p0; + const vfloat y = dot(p0p,dP); + const vfloat cap0 = -r0dr; + const vbool inside_cone = y > cap0; + return valid_i & (p0.x != vfloat(inf)) & (p1.x != vfloat(inf)) & inside_cone; + } + + /* + + This function tests whether a point lies inside the capped cone + tangential to its ending spheres. + + Therefore one has to check if the point is inside the + region defined by the cone clipping planes, which is + performed similar as in the previous function. + + To perform the inside cone test we need to project the + point onto the line p0->p1: + + dP = p1-p0 + Y = (p-p0)*dP/length(dP) + + This value Y is the distance to the projection point from + p0. To obtain a parameter value u going from 0 to 1 along + the line p0->p1 we calculate: + + U = Y/length(dP) + + The radii to use at points p0 and p1 are: + + w0 = sr * r0 + w1 = sr * r1 + dw = w1-w0 + + Using these radii and u one can directly test if the point + lies inside the cone using the formula dP*dP < wy*wy with: + + wy = w0 + u*dw + py = p0 + u*dP - p + + By multiplying the calculations with length(p1-p0) and + inserting the definition of w can obtain simpler equations: + + y = (p-p0)*dP + ry = r0 + y/dP^2 * dr + wy = sr*ry + py = p0 + y/dP^2*dP - p + y0 = - r0 * dr + y1 = dP^2 - r1 * dr + + Thus for the in-cone test we get: + + py^2 < wy^2 + <=> py^2 < sr^2 * ry^2 + <=> py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2 + + This can further get simplified to: + + (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y; + + */ + + __forceinline vbool isInsideCappedCone (const vbool& valid_i, const Vec3vf& p) const + { + const Vec3vf p0p = p - p0; + const vfloat y = dot(p0p,dP); + const vfloat cap0 = -r0dr+vfloat(ulp); + const vfloat cap1 = -r1*dr + dPdP; + + vbool inside_cone = valid_i & (p0.x != vfloat(inf)) & (p1.x != vfloat(inf)); + inside_cone &= y > cap0; // start clipping plane + inside_cone &= 
y < cap1; // end clipping plane + inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test + return inside_cone; + } + + protected: + Vec3vf p0; + Vec3vf p1; + Vec3vf dP; + vfloat dPdP; + vfloat r0; + vfloat sqr_r0; + vfloat r1; + vfloat dr; + vfloat drdr; + vfloat r0dr; + vfloat g; + }; + + template + struct ConeGeometryIntersector : public ConeGeometry + { + using ConeGeometry::p0; + using ConeGeometry::p1; + using ConeGeometry::dP; + using ConeGeometry::dPdP; + using ConeGeometry::r0; + using ConeGeometry::sqr_r0; + using ConeGeometry::r1; + using ConeGeometry::dr; + using ConeGeometry::r0dr; + using ConeGeometry::g; + + ConeGeometryIntersector (const Vec3vf& ray_org, const Vec3vf& ray_dir, const vfloat& dOdO, const vfloat& rcp_dOdO, const Vec4vf& a, const Vec4vf& b) + : ConeGeometry(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir), dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)), yp(OdP + r0dr) {} + + /* + + This function intersects a ray with a cone that touches a + start sphere p0/r0 and end sphere p1/r1. + + To find this ray/cone intersections one could just + calculate radii w0 and w1 as described above and use a + standard ray/cone intersection routine with these + radii. However, it turns out that calculations can get + simplified when deriving a specialized ray/cone + intersection for this special case. We perform + calculations relative to the cone origin p0 and define: + + O = ray_org - p0 + dO = ray_dir + dP = p1-p0 + dr = r1-r0 + dw = w1-w0 + + For some t we can compute the potential hit point h = O + t*dO and + project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). 
In + case of an intersection, the squared distance from the hit point + projected onto the cone center line to the hit point should be equal + to the squared cone radius at u: + + (u*dP - h)^2 = (w0 + u*dw)^2 + + Inserting the definition of h, u, w0, and dw into this formula, then + factoring out all terms, and sorting by t^2, t^1, and t^0 terms + yields a quadratic equation to solve. + + Inserting u: + ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2 + + Multiplying by dP^4: + ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2 + + Inserting w0 and dw: + ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2) + ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2 + + Now one can insert the definition of h, factor out, and presort by t: + ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2 + ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2 + + Factoring out further and sorting by t^2, t^1 and t^0 yields: + + 0 = t^2 * [ ((dO*dP)*dP - dO-dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ] + + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ] + + t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ] + + This can be simplified to: + + 0 = t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ] + + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ] + + t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ] + + Solving this quadratic equation yields the values for t at which the + ray intersects the cone. 
+ + */ + + __forceinline bool intersectCone(vbool& valid, vfloat& lower, vfloat& upper) + { + /* return no hit by default */ + lower = pos_inf; + upper = neg_inf; + + /* compute quadratic equation A*t^2 + B*t + C = 0 */ + const vfloat OO = dot(O,O); + const vfloat OdO = dot(dO,O); + const vfloat A = g * dOdO - sqr(dOdP); + const vfloat B = 2.0f * (g*OdO - dOdP*yp); + const vfloat C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP; + + /* we miss the cone if determinant is smaller than zero */ + const vfloat D = B*B - 4.0f*A*C; + valid &= (D >= 0.0f & g > 0.0f); // if g <= 0 then the cone is inside a sphere end + + /* When rays are parallel to the cone surface, then the + * ray may be inside or outside the cone. We just assume a + * miss in that case, which is fine as rays inside the + * cone would anyway hit the ending spheres in that + * case. */ + valid &= abs(A) > min_rcp_input; + if (unlikely(none(valid))) { + return false; + } + + /* compute distance to front and back hit */ + const vfloat Q = sqrt(D); + const vfloat rcp_2A = rcp(2.0f*A); + t_cone_front = (-B-Q)*rcp_2A; + y_cone_front = yp + t_cone_front*dOdP; + lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat(pos_inf)); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + t_cone_back = (-B+Q)*rcp_2A; + y_cone_back = yp + t_cone_back *dOdP; + upper = select( (y_cone_back > -(float)ulp) & (y_cone_back <= g) & (g > 0.0f), t_cone_back , vfloat(neg_inf)); +#endif + return true; + } + + /* + This function intersects the ray with the end sphere at + p1. We already clip away hits that are inside the + neighboring cone segment. 
+ + */ + + __forceinline void intersectEndSphere(vbool& valid, + const ConeGeometry& coneR, + vfloat& lower, vfloat& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf O1 = org - p1; + const vfloat O1dO = dot(O1,dO); + const vfloat h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1)); + const vfloat rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph1_front = (-O1dO - rhs1)*rcp_dOdO; + const Vec3vf hit_front = org + t_sph1_front*dO; + vbool valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front); + lower = select(valid_sph1_front, t_sph1_front, vfloat(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph1_back = (-O1dO + rhs1)*rcp_dOdO; + const Vec3vf hit_back = org + t_sph1_back*dO; + vbool valid_sph1_back = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back); + upper = select(valid_sph1_back, t_sph1_back, vfloat(neg_inf)); +#else + upper = vfloat(neg_inf); +#endif + } + + __forceinline void intersectBeginSphere(const vbool& valid, + vfloat& lower, vfloat& upper) + { + /* calculate front and back hit with end sphere */ + const Vec3vf O1 = org - p0; + const vfloat O1dO = dot(O1,dO); + const vfloat h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0)); + const vfloat rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat(neg_inf) ); + + /* clip away front hit if it is inside next cone segment */ + t_sph0_front = (-O1dO - rhs1)*rcp_dOdO; + vbool valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0; + lower = select(valid_sph1_front, t_sph0_front, vfloat(pos_inf)); + +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + /* clip away back hit if it is inside next cone segment */ + t_sph0_back = (-O1dO + rhs1)*rcp_dOdO; + vbool valid_sph1_back = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0; + upper = select(valid_sph1_back, t_sph0_back, 
vfloat(neg_inf)); +#else + upper = vfloat(neg_inf); +#endif + } + + /* + + This function calculates the geometry normal of some cone hit. + + For a given hit point h (relative to p0) with a cone + starting at p0 with radius w0 and ending at p1 with + radius w1 one normally calculates the geometry normal by + first calculating the parmetric u hit location along the + cone: + + u = dot(h,dP)/dP^2 + + Using this value one can now directly calculate the + geometry normal by bending the connection vector (h-u*dP) + from hit to projected hit with some cone dependent value + dw/sqrt(dP^2) * normalize(dP): + + Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP) + + The length of the vector (h-u*dP) can also get calculated + by interpolating the radii as w0+u*dw which yields: + + Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP + + Multiplying with (w0+u*dw) yield a scaled Ng': + + Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP + + Inserting the definition of w0 and dw and refactoring + yield a furhter scaled Ng'': + + Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP + + Now inserting the definition of u gives and multiplying + with the denominator yields: + + Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP + + Factoring out, cancelling terms, dividing by dP^2, and + factoring again yields finally: + + Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr) + + */ + + __forceinline Vec3vf Ng_cone(const vbool& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat y = select(front_hit, y_cone_front, y_cone_back); + const vfloat t = select(front_hit, t_cone_front, t_cone_back); + const Vec3vf h = O + t*dO; + return g*h-dP*y; +#else + const Vec3vf h = O + t_cone_front*dO; + return g*h-dP*y_cone_front; +#endif + } + + /* compute geometry normal of sphere hit as the difference + * vector from hit point to sphere center */ + + __forceinline Vec3vf Ng_sphere1(const vbool& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat 
t_sph1 = select(front_hit, t_sph1_front, t_sph1_back); + return org+t_sph1*dO-p1; +#else + return org+t_sph1_front*dO-p1; +#endif + } + + __forceinline Vec3vf Ng_sphere0(const vbool& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat t_sph0 = select(front_hit, t_sph0_front, t_sph0_back); + return org+t_sph0*dO-p0; +#else + return org+t_sph0_front*dO-p0; +#endif + } + + /* + This function calculates the u coordinate of a + hit. Therefore we use the hit distance y (which is zero + at the first cone clipping plane) and divide by distance + g between the clipping planes. + + */ + + __forceinline vfloat u_cone(const vbool& front_hit) const + { +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const vfloat y = select(front_hit, y_cone_front, y_cone_back); + return clamp(y*rcp(g)); +#else + return clamp(y_cone_front*rcp(g)); +#endif + } + + private: + Vec3vf org; + Vec3vf O; + Vec3vf dO; + vfloat dOdO; + vfloat rcp_dOdO; + vfloat OdP; + vfloat dOdP; + + /* for ray/cone intersection */ + private: + vfloat yp; + vfloat y_cone_front; + vfloat t_cone_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat y_cone_back; + vfloat t_cone_back; +#endif + + /* for ray/sphere intersection */ + private: + vfloat t_sph1_front; + vfloat t_sph0_front; +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat t_sph1_back; + vfloat t_sph0_back; +#endif + }; + + + template + static __forceinline bool intersectConeSphere(const vbool& valid_i, + const Vec3vf& ray_org_in, const Vec3vf& ray_dir, + const vfloat& ray_tnear, const ray_tfar_func& ray_tfar, + const Vec4vf& v0, const Vec4vf& v1, + const Vec4vf& vL, const Vec4vf& vR, + const Epilog& epilog) + { + vbool valid = valid_i; + + /* move ray origin closer to make calculations numerically stable */ + const vfloat dOdO = sqr(ray_dir); + const vfloat rcp_dOdO = rcp(dOdO); + const Vec3vf center = vfloat(0.5f)*(v0.xyz()+v1.xyz()); + const vfloat dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO; + const Vec3vf 
ray_org = ray_org_in + dt*ray_dir; + + /* intersect with cone from v0 to v1 */ + vfloat t_cone_lower, t_cone_upper; + ConeGeometryIntersector cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1); + vbool validCone = valid; + cone.intersectCone(validCone, t_cone_lower, t_cone_upper); + + valid &= (validCone | (cone.g <= 0.0f)); // if cone is entirely in sphere end - check sphere + if (unlikely(none(valid))) + return false; + + /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */ + const ConeGeometry coneL (v0, vL); + const ConeGeometry coneR (v1, vR); +#if !defined(EMBREE_BACKFACE_CULLING_CURVES) + const Vec3vf hit_lower = ray_org + t_cone_lower*ray_dir; + const Vec3vf hit_upper = ray_org + t_cone_upper*ray_dir; + t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat(pos_inf)); + t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat(neg_inf)); +#endif + + /* intersect ending sphere */ + vfloat t_sph1_lower, t_sph1_upper; + vfloat t_sph0_lower = vfloat(pos_inf); + vfloat t_sph0_upper = vfloat(neg_inf); + cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper); + + const vbool isBeginPoint = valid & (vL[0] == vfloat(pos_inf)); + if (unlikely(any(isBeginPoint))) { + cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper); + } + + /* CSG union of cone and end sphere */ + vfloat t_sph_lower = min(t_sph0_lower, t_sph1_lower); + vfloat t_cone_sphere_lower = min(t_cone_lower, t_sph_lower); +#if !defined (EMBREE_BACKFACE_CULLING_CURVES) + vfloat t_sph_upper = max(t_sph0_upper, t_sph1_upper); + vfloat t_cone_sphere_upper = max(t_cone_upper, t_sph_upper); + + /* filter out hits that are not in tnear/tfar range */ + const vbool valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & 
t_cone_sphere_lower != vfloat(pos_inf); + const vbool valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat(neg_inf); + + /* check if there is a first hit */ + const vbool valid_first = valid_lower | valid_upper; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper); + const vbool cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper; + const vbool sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper; + const Vec3vf Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat(zero), vfloat(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM hit(u_first,zero,dt+t_first,Ng_first); + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat t_second = t_cone_sphere_upper; + const vbool valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar()); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const vbool cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper; + const vbool sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper; + const Vec3vf Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false))); + const vfloat u_second = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat(zero), vfloat(one))); + + hit = RoundLineIntersectorHitM(u_second,zero,dt+t_second,Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return 
is_hit_first | is_hit_second; +#else + /* filter out hits that are not in tnear/tfar range */ + const vbool valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat(pos_inf); + + /* check if there is a valid hit */ + if (unlikely(none(valid_lower))) + return false; + + /* construct first hit */ + const vbool cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper; + const vbool sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper; + const Vec3vf Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower))); + const vfloat u_first = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat(zero), vfloat(one))); + + /* invoke intersection filter for first hit */ + RoundLineIntersectorHitM hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first); + const bool is_hit_first = epilog(valid_lower, hit); + + return is_hit_first; +#endif + } + + } // end namespace __roundline_internal + + template + struct RoundLinearCurveIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + struct ray_tfar { + Ray& ray; + __forceinline ray_tfar(Ray& ray) : ray(ray) {} + __forceinline vfloat operator() () const { return ray.tfar; }; + }; + + template + static __forceinline bool intersect(const vbool& valid_i, + Ray& ray, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& v0i, const Vec4vf& v1i, + const Vec4vf& vLi, const Vec4vf& vRi, + const Epilog& epilog) + { + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const vfloat ray_tnear(ray.tnear()); + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + const Vec4vf vL = 
enlargeRadiusToMinWidth(context,geom,ray_org,vLi); + const Vec4vf vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,vL,vR,epilog); + } + }; + + template + struct RoundLinearCurveIntersectorK + { + typedef CurvePrecalculationsK Precalculations; + + struct ray_tfar { + RayK& ray; + size_t k; + __forceinline ray_tfar(RayK& ray, size_t k) : ray(ray), k(k) {} + __forceinline vfloat operator() () const { return ray.tfar[k]; }; + }; + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, size_t k, + IntersectContext* context, + const LineSegments* geom, + const Precalculations& pre, + const Vec4vf& v0i, const Vec4vf& v1i, + const Vec4vf& vLi, const Vec4vf& vRi, + const Epilog& epilog) + { + const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat ray_tnear = ray.tnear()[k]; + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec4vf v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i); + const Vec4vf vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi); + const Vec4vf vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi); + return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h new file mode 100644 index 00000000000..079817335eb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h @@ -0,0 +1,136 @@ +// ======================================================================== // +// Copyright 2009-2020 Intel Corporation // +// // +// Licensed under the Apache License, Version 2.0 (the "License"); // +// you may not use this file except in compliance with 
the License. // +// You may obtain a copy of the License at // +// // +// http://www.apache.org/licenses/LICENSE-2.0 // +// // +// Unless required by applicable law or agreed to in writing, software // +// distributed under the License is distributed on an "AS IS" BASIS, // +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // +// See the License for the specific language governing permissions and // +// limitations under the License. // +// ======================================================================== // + +#pragma once + +#include "roundline_intersector.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + template + struct RoundLinearCurveMiIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool valid = line.template valid(); + RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool valid = line.template valid(); + return RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct 
RoundLinearCurveMiMBIntersector1 + { + typedef LineMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool valid = line.template valid(); + RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()); + const vbool valid = line.template valid(); + return RoundLinearCurveIntersector1::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM(ray,context,line.geomID(),line.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line) + { + return PrimitivePointQuery1::pointQuery(query, context, line); + } + }; + + template + struct RoundLinearCurveMiIntersectorK + { + typedef LineMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool valid = line.template valid(); + RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, 
IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom); + const vbool valid = line.template valid(); + return RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + + template + struct RoundLinearCurveMiMBIntersectorK + { + typedef LineMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(normal.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool valid = line.template valid(); + RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& line) + { + STAT3(shadow.trav_prims,1,1,1); + const LineSegments* geom = context->scene->get(line.geomID()); + Vec4vf v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]); + const vbool valid = line.template valid(); + return RoundLinearCurveIntersectorK::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM(ray,k,context,line.geomID(),line.primID())); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h new file mode 100644 index 00000000000..3ab90c29ef7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h @@ -0,0 +1,183 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include 
"../common/ray.h" +#include "../common/scene_points.h" +#include "curve_intersector_precalculations.h" + +namespace embree +{ + namespace isa + { + template + struct SphereIntersectorHitM + { + __forceinline SphereIntersectorHitM() {} + + __forceinline SphereIntersectorHitM(const vfloat& t, const Vec3vf& Ng) + : vt(t), vNg(Ng) {} + + __forceinline void finalize() {} + + __forceinline Vec2f uv(const size_t i) const { + return Vec2f(0.0f, 0.0f); + } + __forceinline float t(const size_t i) const { + return vt[i]; + } + __forceinline Vec3fa Ng(const size_t i) const { + return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); + } + + public: + vfloat vt; + Vec3vf vNg; + }; + + template + struct SphereIntersector1 + { + typedef CurvePrecalculations1 Precalculations; + + template + static __forceinline bool intersect( + const vbool& valid_i, Ray& ray, + const Precalculations& pre, const Vec4vf& v0, const Epilog& epilog) + { + vbool valid = valid_i; + + const vfloat rd2 = rcp(dot(ray.dir, ray.dir)); + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec3vf ray_dir(ray.dir.x, ray.dir.y, ray.dir.z); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + const Vec3vf c0 = center - ray_org; + const vfloat projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf perp = c0 - projC0 * ray_dir; + const vfloat l2 = dot(perp, perp); + const vfloat r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat td = sqrt((r2 - l2) * rd2); + const vfloat t_front = projC0 - td; + const vfloat t_back = projC0 + td; + + const vbool valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar); + const vbool valid_back = valid & (ray.tnear() <= t_back ) & (t_back <= ray.tfar); + + /* check if there is a first hit */ + const vbool valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat td_front = -td; + const vfloat td_back = +td; + const vfloat t_first 
= select(valid_front, t_front, t_back); + const Vec3vf Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat t_second = t_back; + const vbool valid_second = valid_front & valid_back & (t_second <= ray.tfar); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + + template + static __forceinline bool intersect( + const vbool& valid_i, Ray& ray, IntersectContext* context, const Points* geom, + const Precalculations& pre, const Vec4vf& v0i, const Epilog& epilog) + { + const Vec3vf ray_org(ray.org.x, ray.org.y, ray.org.z); + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + return intersect(valid_i,ray,pre,v0,epilog); + } + }; + + template + struct SphereIntersectorK + { + typedef CurvePrecalculationsK Precalculations; + + template + static __forceinline bool intersect(const vbool& valid_i, + RayK& ray, size_t k, + IntersectContext* context, + const Points* geom, + const Precalculations& pre, + const Vec4vf& v0i, + const Epilog& epilog) + { + vbool valid = valid_i; + + const Vec3vf ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]); + const Vec3vf ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]); + const vfloat rd2 = rcp(dot(ray_dir, ray_dir)); + + const Vec4vf v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i); + const Vec3vf center = v0.xyz(); + const vfloat radius = v0.w; + + const Vec3vf c0 = center - ray_org; + const vfloat projC0 = dot(c0, ray_dir) * rd2; + const Vec3vf perp = c0 - projC0 * ray_dir; + const vfloat l2 = 
dot(perp, perp); + const vfloat r2 = radius * radius; + valid &= (l2 <= r2); + if (unlikely(none(valid))) + return false; + + const vfloat td = sqrt((r2 - l2) * rd2); + const vfloat t_front = projC0 - td; + const vfloat t_back = projC0 + td; + + const vbool valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]); + const vbool valid_back = valid & (ray.tnear()[k] <= t_back ) & (t_back <= ray.tfar[k]); + + /* check if there is a first hit */ + const vbool valid_first = valid_front | valid_back; + if (unlikely(none(valid_first))) + return false; + + /* construct first hit */ + const vfloat td_front = -td; + const vfloat td_back = +td; + const vfloat t_first = select(valid_front, t_front, t_back); + const Vec3vf Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp; + SphereIntersectorHitM hit(t_first, Ng_first); + + /* invoke intersection filter for first hit */ + const bool is_hit_first = epilog(valid_first, hit); + + /* check for possible second hits before potentially accepted hit */ + const vfloat t_second = t_back; + const vbool valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]); + if (unlikely(none(valid_second))) + return is_hit_first; + + /* invoke intersection filter for second hit */ + const Vec3vf Ng_second = td_back * ray_dir - perp; + hit = SphereIntersectorHitM (t_second, Ng_second); + const bool is_hit_second = epilog(valid_second, hit); + + return is_hit_first | is_hit_second; + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h new file mode 100644 index 00000000000..11468476022 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h @@ -0,0 +1,156 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "intersector_epilog.h" +#include "pointi.h" +#include 
"sphere_intersector.h" + +namespace embree +{ + namespace isa + { + template + struct SphereMiIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom); + const vbool valid = sphere.template valid(); + SphereIntersector1::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom); + const vbool valid = sphere.template valid(); + return SphereIntersector1::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1::pointQuery(query, context, sphere); + } + }; + + template + struct SphereMiMBIntersector1 + { + typedef PointMi Primitive; + typedef CurvePrecalculations1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, + RayHit& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom, ray.time()); + const vbool valid = sphere.template valid(); + SphereIntersector1::intersect( + valid, ray, context, geom, pre, v0, Intersect1EpilogM(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded(const Precalculations& 
pre, + Ray& ray, + IntersectContext* context, + const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom, ray.time()); + const vbool valid = sphere.template valid(); + return SphereIntersector1::intersect( + valid, ray, context, geom, pre, v0, Occluded1EpilogM(ray, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, + PointQueryContext* context, + const Primitive& sphere) + { + return PrimitivePointQuery1::pointQuery(query, context, sphere); + } + }; + + template + struct SphereMiIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom); + const vbool valid = sphere.template valid(); + SphereIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom); + const vbool valid = sphere.template valid(); + return SphereIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + + template + struct SphereMiMBIntersectorK + { + typedef PointMi Primitive; + typedef CurvePrecalculationsK Precalculations; + + static __forceinline void intersect( + const Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { 
+ STAT3(normal.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool valid = sphere.template valid(); + SphereIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Intersect1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); + } + + static __forceinline bool occluded( + const Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& sphere) + { + STAT3(shadow.trav_prims, 1, 1, 1); + const Points* geom = context->scene->get(sphere.geomID()); + Vec4vf v0; sphere.gather(v0, geom, ray.time()[k]); + const vbool valid = sphere.template valid(); + return SphereIntersectorK::intersect( + valid, ray, k, context, geom, pre, v0, + Occluded1KEpilogM(ray, k, context, sphere.geomID(), sphere.primID())); + } + }; + } // namespace isa +} // namespace embree diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h new file mode 100644 index 00000000000..94ad46ad87a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h @@ -0,0 +1,38 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "../subdiv/subdivpatch1base.h" + +namespace embree +{ + + struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base + { + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static Type type; + + public: + + /*! 
constructor for cached subdiv patch */ + SubdivPatch1 (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width) + : SubdivPatch1Base(gID,pID,subPatch,mesh,time,uv,edge_level,subdiv,simd_width) {} + }; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h new file mode 100644 index 00000000000..74ec1de2586 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h @@ -0,0 +1,237 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subdivpatch1.h" +#include "grid_soa.h" +#include "grid_soa_intersector1.h" +#include "grid_soa_intersector_packet.h" +#include "../common/ray.h" + +namespace embree +{ + namespace isa + { + template + class SubdivPatch1Precalculations : public T + { + public: + __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) + : T(ray,ptr) {} + }; + + template + class SubdivPatch1PrecalculationsK : public T + { + public: + __forceinline SubdivPatch1PrecalculationsK (const vbool& valid, RayK& ray) + : T(valid,ray) {} + }; + + class SubdivPatch1Intersector1 + { + public: + typedef GridSOA Primitive; + typedef SubdivPatch1Precalculations Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + /*! Intersect a ray with the primitive. 
*/ + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! Test if the ray is occluded by the primitive */ + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } 
+ }; + + class SubdivPatch1MBIntersector1 + { + public: + typedef SubdivPatch1 Primitive; + typedef GridSOAMBIntersector1::Precalculations Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = nullptr; + grid = (GridSOA*) prim->root_ref.get(); + pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime); + lazy_node = grid->root(pre.itime); + pre.grid = grid; + return false; + } + + /*! Intersect a ray with the primitive. */ + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node); + else processLazyNode(pre,ray,context,prim,lazy_node); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { + intersect(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + /*! 
Test if the ray is occluded by the primitive */ + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,ray,context,prim,lazy_node); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) { + return occluded(This,pre,ray,context,prim,ty,tray,lazy_node); + } + + template + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) + { + // TODO: PointQuery implement + assert(false && "not implemented"); + return false; + } + + template + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery &tquery, size_t& lazy_node) { + return pointQuery(This,query,context,prim,ty,tquery,lazy_node); + } + }; + + template + struct SubdivPatch1IntersectorK + { + typedef GridSOA Primitive; + typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) + { + lazy_node = prim->root(0); + pre.grid = (Primitive*)prim; + return false; + } + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) + { + if (likely(ty == 0)) 
GridSOAIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + }; + + typedef SubdivPatch1IntersectorK<4> SubdivPatch1Intersector4; + typedef SubdivPatch1IntersectorK<8> SubdivPatch1Intersector8; + typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16; + + template + struct SubdivPatch1MBIntersectorK + { + typedef SubdivPatch1 Primitive; + //typedef GridSOAMBIntersectorK::Precalculations Precalculations; + typedef SubdivPatch1PrecalculationsK::Precalculations> Precalculations; + + static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) + { + Primitive* prim = (Primitive*) prim_i; + GridSOA* grid = (GridSOA*) prim->root_ref.get(); + lazy_node = grid->troot; + 
pre.grid = grid; + return false; + } + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK::intersect(valid,pre,ray,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK::occluded(valid,pre,ray,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) GridSOAMBIntersectorK::intersect(pre,ray,k,context,prim,lazy_node); + else processLazyNode(pre,context,prim,lazy_node); + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay &tray, size_t& lazy_node) + { + if (likely(ty == 0)) return GridSOAMBIntersectorK::occluded(pre,ray,k,context,prim,lazy_node); + else return processLazyNode(pre,context,prim,lazy_node); + } + }; + + typedef SubdivPatch1MBIntersectorK<4> SubdivPatch1MBIntersector4; + typedef SubdivPatch1MBIntersectorK<8> SubdivPatch1MBIntersector8; + typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h new file mode 100644 index 
00000000000..39fa6fb0f04 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h @@ -0,0 +1,517 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/ray.h" +#include "../common/scene_grid_mesh.h" +#include "../bvh/bvh.h" + +namespace embree +{ + /* Stores M quads from an indexed face set */ + struct SubGrid + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored quads */ + static __forceinline size_t max_size() { return 1; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline SubGrid() { } + + /* Construction from vertices and IDs */ + __forceinline SubGrid(const unsigned int x, + const unsigned int y, + const unsigned int geomID, + const unsigned int primID) + : _x(x), _y(y), _geomID(geomID), _primID(primID) + { + } + + __forceinline bool invalid3x3X() const { return (unsigned int)_x & (1<<15); } + __forceinline bool invalid3x3Y() const { return (unsigned int)_y & (1<<15); } + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = vfloat4::loadu(mesh->vertexPtr(vtxID00)); + const vfloat4 vtx01 = vfloat4::loadu(mesh->vertexPtr(vtxID01)); + const size_t 
vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = vfloat4::loadu(mesh->vertexPtr(vtxID10)); + const vfloat4 vtx11 = vfloat4::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = vfloat4::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = vfloat4::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = vfloat4::loadu(mesh->vertexPtr(vtxID20)); + const vfloat4 vtx21 = vfloat4::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = vfloat4::loadu(mesh->vertexPtr(vtxID22)); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + template + __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const + { + const T v0 = T::loadu(mesh->vertexPtr(offset,itime+0)); + const T v1 = T::loadu(mesh->vertexPtr(offset,itime+1)); + return lerp(v0,v1,ftime); + } + + /* Gather the quads */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const GridMesh* const mesh, + const GridMesh::Grid &g, + const size_t itime, + const float ftime) const + { + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = getVertexMB(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB(mesh,vtxID01,itime,ftime); + const size_t 
vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB(mesh,vtxID22,itime,ftime); + + transpose(vtx00,vtx01,vtx11,vtx10,p0.x,p0.y,p0.z); + transpose(vtx01,vtx02,vtx12,vtx11,p1.x,p1.y,p1.z); + transpose(vtx11,vtx12,vtx22,vtx21,p2.x,p2.y,p2.z); + transpose(vtx10,vtx11,vtx21,vtx20,p3.x,p3.y,p3.z); + } + + + + /* Gather the quads */ + __forceinline void gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene) const + { + const GridMesh* const mesh = scene->get(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gather(p0,p1,p2,p3,mesh,g); + } + + /* Gather the quads in the motion blur case */ + __forceinline void gatherMB(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + Vec3vf4& p3, + const Scene *const scene, + const size_t itime, + const float ftime) const + { + const GridMesh* const mesh = scene->get(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + gatherMB(p0,p1,p2,p3,mesh,g,itime,ftime); + } + + /* Gather the quads */ + __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const + { + const GridMesh* mesh = scene->get(geomID()); + 
const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const Vec3fa vtx00 = Vec3fa::loadu(mesh->vertexPtr(vtxID00)); + const Vec3fa vtx01 = Vec3fa::loadu(mesh->vertexPtr(vtxID01)); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const Vec3fa vtx10 = Vec3fa::loadu(mesh->vertexPtr(vtxID10)); + const Vec3fa vtx11 = Vec3fa::loadu(mesh->vertexPtr(vtxID11)); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const Vec3fa vtx02 = Vec3fa::loadu(mesh->vertexPtr(vtxID02)); + const size_t vtxID12 = vtxID11 + deltaX; + const Vec3fa vtx12 = Vec3fa::loadu(mesh->vertexPtr(vtxID12)); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const Vec3fa vtx20 = Vec3fa::loadu(mesh->vertexPtr(vtxID20)); + const Vec3fa vtx21 = Vec3fa::loadu(mesh->vertexPtr(vtxID21)); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const Vec3fa vtx22 = Vec3fa::loadu(mesh->vertexPtr(vtxID22)); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + /* Gather the quads */ + __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const + { + const GridMesh* mesh = scene->get(geomID()); + const GridMesh::Grid &g = mesh->grid(primID()); + + /* first quad always valid */ + const size_t vtxID00 = g.startVtxID + x() + y() * g.lineVtxOffset; + const size_t vtxID01 = vtxID00 + 1; + const vfloat4 vtx00 = 
getVertexMB(mesh,vtxID00,itime,ftime); + const vfloat4 vtx01 = getVertexMB(mesh,vtxID01,itime,ftime); + const size_t vtxID10 = vtxID00 + g.lineVtxOffset; + const size_t vtxID11 = vtxID01 + g.lineVtxOffset; + const vfloat4 vtx10 = getVertexMB(mesh,vtxID10,itime,ftime); + const vfloat4 vtx11 = getVertexMB(mesh,vtxID11,itime,ftime); + + /* deltaX => vtx02, vtx12 */ + const size_t deltaX = invalid3x3X() ? 0 : 1; + const size_t vtxID02 = vtxID01 + deltaX; + const vfloat4 vtx02 = getVertexMB(mesh,vtxID02,itime,ftime); + const size_t vtxID12 = vtxID11 + deltaX; + const vfloat4 vtx12 = getVertexMB(mesh,vtxID12,itime,ftime); + + /* deltaY => vtx20, vtx21 */ + const size_t deltaY = invalid3x3Y() ? 0 : g.lineVtxOffset; + const size_t vtxID20 = vtxID10 + deltaY; + const size_t vtxID21 = vtxID11 + deltaY; + const vfloat4 vtx20 = getVertexMB(mesh,vtxID20,itime,ftime); + const vfloat4 vtx21 = getVertexMB(mesh,vtxID21,itime,ftime); + + /* deltaX/deltaY => vtx22 */ + const size_t vtxID22 = vtxID11 + deltaX + deltaY; + const vfloat4 vtx22 = getVertexMB(mesh,vtxID22,itime,ftime); + + vtx[ 0] = vtx00; vtx[ 1] = vtx01; vtx[ 2] = vtx11; vtx[ 3] = vtx10; + vtx[ 4] = vtx01; vtx[ 5] = vtx02; vtx[ 6] = vtx12; vtx[ 7] = vtx11; + vtx[ 8] = vtx10; vtx[ 9] = vtx11; vtx[10] = vtx21; vtx[11] = vtx20; + vtx[12] = vtx11; vtx[13] = vtx12; vtx[14] = vtx22; vtx[15] = vtx21; + } + + + /* Calculate the bounds of the subgrid */ + __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const + { + BBox3fa bounds = empty; + FATAL("not implemented yet"); + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) + { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + 
__forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + FATAL("not implemented yet"); + return allBounds; + } + + + friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) { + return cout << "SubGrid " << " ( x " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )"; + } + + __forceinline unsigned int geomID() const { return _geomID; } + __forceinline unsigned int primID() const { return _primID; } + __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; } + __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; } + + private: + unsigned short _x; + unsigned short _y; + unsigned int _geomID; // geometry ID of mesh + unsigned int _primID; // primitive ID of primitive inside mesh + }; + + struct SubGridID { + unsigned short x; + unsigned short y; + unsigned int primID; + + __forceinline SubGridID() {} + __forceinline SubGridID(const unsigned int x, const unsigned int y, const unsigned int primID) : + x(x), y(y), primID(primID) {} + }; + + /* QuantizedBaseNode as large subgrid leaf */ + template + struct SubGridQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i::AABBNode node; + node.clear(); + for (size_t i=0;i::QuantizedBaseNode qnode; + + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) { + cout << "SubGridQBVHN " << embree_endl; + for (size_t i=0;i + typename SubGridQBVHN::Type SubGridQBVHN::type; + + typedef SubGridQBVHN<4> SubGridQBVH4; + typedef SubGridQBVHN<8> SubGridQBVH8; + + + /* 
QuantizedBaseNode as large subgrid leaf */ + template + struct SubGridMBQBVHN + { + /* Virtual interface to query information about the quad type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + __forceinline size_t size() const + { + for (size_t i=0;i::AABBNode node0,node1; + node0.clear(); + node1.clear(); + for (size_t i=0;i + __forceinline vfloat adjustTime(const vfloat &t) const { return time_scale * (t-time_offset); } + + public: + SubGridID subgridIDs[N]; + + typename BVHN::QuantizedBaseNodeMB qnode; + + float time_offset; + float time_scale; + unsigned int _geomID; // geometry ID of mesh + + + friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) { + cout << "SubGridMBQBVHN " << embree_endl; + for (size_t i=0;i + struct SubGridIntersector1Moeller + { + typedef SubGridQBVHN Primitive; + typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, 
PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + assert(accel); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = 
BVHNQuantizedBaseNodePointQueryAABB1::pointQuery(&prim[i].qnode,tquery,dist); + } + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template + struct SubGridIntersector1Pluecker + { + typedef SubGridQBVHN Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + STAT3(point_query.trav_prims,1,1,1); + AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); + context->geomID = subgrid.geomID(); + context->primID = subgrid.primID(); + return accel->pointQuery(query, context); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) + { + bool changed = false; + for (size_t i=0;i dist; + size_t mask; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + mask = BVHNQuantizedBaseNodePointQuerySphere1::pointQuery(&prim[i].qnode,tquery,dist); + } else { + mask = BVHNQuantizedBaseNodePointQueryAABB1::pointQuery(&prim[i].qnode,tquery,dist); + } +#if defined(__AVX__) + STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + changed |= pointQuery(query, context, prim[i].subgrid(ID)); + } + } + return changed; + } + }; + + template + struct SubGridIntersectorKMoeller + { + typedef SubGridQBVHN Primitive; + typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, 
IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf p0 = vtx[i*4+0]; + const Vec3vf p1 = vtx[i*4+1]; + const Vec3vf p2 = vtx[i*4+2]; + const Vec3vf p3 = vtx[i*4+3]; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf p0 = vtx[i*4+0]; + const Vec3vf p1 = vtx[i*4+1]; + const Vec3vf p2 = vtx[i*4+2]; + const Vec3vf p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); 
+ const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + for (size_t j=0;j dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + vbool valid0 = valid; + for (size_t j=0;j dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& 
pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + + + template + struct SubGridIntersectorKPluecker + { + typedef SubGridQBVHN Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const SubGrid& subgrid) + { + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf p0 = vtx[i*4+0]; + const Vec3vf p1 = vtx[i*4+1]; + const Vec3vf p2 = vtx[i*4+2]; + const Vec3vf p3 = vtx[i*4+3]; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i)); + } + } + + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool valid0 = valid_i; + Vec3fa vtx[16]; + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + subgrid.gather(vtx,context->scene); + for (unsigned int i=0; i<4; i++) + { + const Vec3vf p0 = vtx[i*4+0]; + const Vec3vf p1 = vtx[i*4+1]; + const Vec3vf p2 = vtx[i*4+2]; + const Vec3vf p3 = vtx[i*4+3]; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + if 
(pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) + break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + for (size_t j=0;j dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + vbool valid0 = valid; + for (size_t j=0;j dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) 
continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask())); + + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h new file mode 100644 index 00000000000..f65b4abf61c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h @@ -0,0 +1,493 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + + /* ----------------------------- */ + /* -- single ray intersectors -- */ + /* ----------------------------- */ + + template + 
__forceinline void interpolateUV(MoellerTrumboreHitM &hit,const GridMesh::Grid &g, const SubGrid& subgrid) + { + /* correct U,V interpolation across the entire grid */ + const vint sx((int)subgrid.x()); + const vint sy((int)subgrid.y()); + const vint sxM(sx + vint(0,1,1,0)); + const vint syM(sy + vint(0,0,1,1)); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + (vfloat)sxM * hit.absDen) * inv_resX; + hit.V = (hit.V + (vfloat)syM * hit.absDen) * inv_resY; + } + + template + struct SubGridQuadMIntersector1MoellerTrumbore; + + template + struct SubGridQuadMIntersector1MoellerTrumbore + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + MoellerTrumboreHitM hit; + MoellerTrumboreIntersector1 intersector(ray,nullptr); + Intersect1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,hit)) + { + interpolateUV(hit,g,subgrid); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(hit,g,subgrid); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + MoellerTrumboreHitM hit; + MoellerTrumboreIntersector1 intersector(ray,nullptr); + Occluded1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersector.intersect(ray,v0,v1,v3,hit)) + { + 
interpolateUV(hit,g,subgrid); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersector.intersect(ray,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(hit,g,subgrid); + if (epilog(hit.valid,hit)) + return true; + } + return false; + } + }; + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template + struct SubGridQuadMIntersector1MoellerTrumbore<4,filter> + { + __forceinline SubGridQuadMIntersector1MoellerTrumbore() {} + + __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + MoellerTrumboreHitM<8> hit; + MoellerTrumboreIntersector1<8> intersector(ray,nullptr); + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) + { + vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; + +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + /* correct U,V interpolation across the entire grid */ + const vint8 sx((int)subgrid.x()); + const vint8 sy((int)subgrid.y()); + const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); + const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1)); 
+ const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; + hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY; + + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + // ============================================================================================================================ + // ============================================================================================================================ + // ============================================================================================================================ + + + /* ----------------------------- */ + /* -- ray packet intersectors -- */ + /* ----------------------------- */ + + template + struct SubGridQuadHitK + { + __forceinline SubGridQuadHitK(const vfloat& U, + const vfloat& V, + const vfloat& T, + const vfloat& absDen, + const Vec3vf& Ng, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid& subgrid, + const unsigned int i) + : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vfloat rcpAbsDen = rcp(absDen); + const vfloat t = T * 
rcpAbsDen; + const vfloat u0 = min(U * rcpAbsDen,1.0f); + const vfloat v0 = min(V * rcpAbsDen,1.0f); + const vfloat u1 = vfloat(1.0f) - u0; + const vfloat v1 = vfloat(1.0f) - v0; + const vfloat uu = select(flags,u1,u0); + const vfloat vv = select(flags,v1,v0); + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + const vfloat u = (uu + (float)(int)sx) * inv_resX; + const vfloat v = (vv + (float)(int)sy) * inv_resY; + const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat T; + const vfloat absDen; + const vbool flags; + const Vec3vf tri_Ng; + + const GridMesh::Grid &g; + const SubGrid& subgrid; + const size_t i; + }; + + template + struct SubGridQuadMIntersectorKMoellerTrumboreBase + { + __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool& valid, const RayK& ray) {} + + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool valid = valid0; + const Vec3vf C = tri_v0 - ray.org; + const Vec3vf R = cross(C,ray.dir); + const vfloat den = dot(tri_Ng,ray.dir); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat U = dot(R,tri_e2) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat V = dot(R,tri_e1) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat W = absDen-U-V; + valid &= W >= 0.0f; + if 
(likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + SubGridQuadHitK hit(U,V,T,absDen,tri_Ng,flags,g,subgrid,i); + return epilog(valid,hit); + } + + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + const Vec3vf e1 = tri_v0-tri_v1; + const Vec3vf e2 = tri_v2-tri_v0; + const Vec3vf Ng = cross(e2,e1); + return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,g,subgrid,i,epilog); + } + + template + __forceinline bool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Vec3vf& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool(false),g,subgrid,i,epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool(true ),g,subgrid,i,epilog); + return none(valid0); + } + + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + MoellerTrumboreHitM &hit) + { + /* calculate denominator */ + const Vec3vf O = broadcast>(ray.org,k); + const Vec3vf D = broadcast>(ray.dir,k); + const Vec3vf C = Vec3vf(tri_v0) - O; + const Vec3vf R = cross(C,D); + const vfloat den = dot(Vec3vf(tri_Ng),D); + const vfloat absDen = abs(den); + const vfloat sgnDen = 
signmsk(den); + + /* perform edge tests */ + const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; + const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); + return true; + } + + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + MoellerTrumboreHitM &hit) + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + const Vec3vf Ng = cross(e2,e1); + return intersect1(ray,k,v0,e1,e2,Ng,hit); + } + + }; + + template + struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase(valid,ray) {} + + __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + Intersect1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + + MoellerTrumboreHitM<4> hit; + if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit)) + { + interpolateUV(hit,g,subgrid); + epilog(hit.valid,hit); + } + + if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + 
interpolateUV(hit,g,subgrid); + epilog(hit.valid,hit); + } + + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + Occluded1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + + MoellerTrumboreHitM<4> hit; + if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,hit)) + { + interpolateUV(hit,g,subgrid); + if (epilog(hit.valid,hit)) return true; + } + + if (SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,hit)) + { + hit.U = hit.absDen - hit.U; + hit.V = hit.absDen - hit.V; + interpolateUV(hit,g,subgrid); + if (epilog(hit.valid,hit)) return true; + } + return false; + } + }; + + +#if defined (__AVX__) + + /*! Intersects 4 quads with 1 ray using AVX */ + template + struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter> + { + __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool& valid, const RayK& ray) + : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {} + + template + __forceinline bool intersect1(RayK& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + const vbool8 flags(0,0,0,0,1,1,1,1); + + MoellerTrumboreHitM<8> hit; + if 
(SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit)) + { + vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen; +#if !defined(EMBREE_BACKFACE_CULLING) + hit.U = select(flags,absDen-V,U); + hit.V = select(flags,absDen-U,V); + hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); +#else + hit.U = select(flags,absDen-U,U); + hit.V = select(flags,absDen-V,V); +#endif + + /* correct U,V interpolation across the entire grid */ + const vint8 sx((int)subgrid.x()); + const vint8 sy((int)subgrid.y()); + const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0)); + const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1)); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = rcp((float)((int)g.resY-1)); + hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; + hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY; + if (unlikely(epilog(hit.valid,hit))) + return true; + + } + return false; + } + + __forceinline bool intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h new file mode 100644 index 00000000000..1cd88aa799e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h @@ -0,0 +1,508 @@ +// 
Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid.h" +#include "quad_intersector_moeller.h" +#include "quad_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + + template + struct SubGridQuadHitPlueckerM + { + __forceinline SubGridQuadHitPlueckerM() {} + + __forceinline SubGridQuadHitPlueckerM(const vbool& valid, + const vfloat& U, + const vfloat& V, + const vfloat& UVW, + const vfloat& t, + const Vec3vf& Ng, + const vbool& flags) : valid(valid), vt(t) + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + const vfloat u = min(U * rcpUVW,1.0f); + const vfloat v = min(V * rcpUVW,1.0f); + const vfloat u1 = vfloat(1.0f) - u; + const vfloat v1 = vfloat(1.0f) - v; +#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING) + vu = select(flags,u1,u); + vv = select(flags,v1,v); + vNg = Vec3vf(Ng.x,Ng.y,Ng.z); +#else + const vfloat flip = select(flags,vfloat(-1.0f),vfloat(1.0f)); + vv = select(flags,u1,v); + vu = select(flags,v1,u); + vNg = Vec3vf(flip*Ng.x,flip*Ng.y,flip*Ng.z); +#endif + } + + __forceinline void finalize() + { + } + + __forceinline Vec2f uv(const size_t i) + { + const float u = vu[i]; + const float v = vv[i]; + return Vec2f(u,v); + } + + __forceinline float t(const size_t i) { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + __forceinline void interpolateUV(SubGridQuadHitPlueckerM &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint &stepX, const vint &stepY) + { + /* correct U,V interpolation across the entire grid */ + const vint sx((int)subgrid.x()); + const vint sy((int)subgrid.y()); + const vint sxM(sx + stepX); + const vint syM(sy + stepY); + const float inv_resX = rcp((float)((int)g.resX-1)); + const float inv_resY = 
rcp((float)((int)g.resY-1)); + hit.vu = (hit.vu + vfloat(sxM)) * inv_resX; + hit.vv = (hit.vv + vfloat(syM)) * inv_resY; + } + + template + __forceinline static bool intersectPluecker(Ray& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const vbool& flags, + SubGridQuadHitPlueckerM &hit) + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = Vec3vf((Vec3fa)ray.org); + const Vec3vf D = Vec3vf((Vec3fa)ray.dir); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + new (&hit) SubGridQuadHitPlueckerM(valid,U,V,UVW,t,Ng,flags); + return true; + } + + template + struct SubGridQuadMIntersector1Pluecker; + + template + struct SubGridQuadMIntersector1Pluecker + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + __forceinline void intersect(RayHit& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const 
GridMesh::Grid &g, const SubGrid& subgrid) const + { + SubGridQuadHitPlueckerM hit; + Intersect1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersectPluecker(ray,v0,v1,v3,vbool(false),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + epilog(hit.valid,hit); + } + + /* intersect second triangle */ + if (intersectPluecker(ray,v2,v3,v1,vbool(true),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + epilog(hit.valid,hit); + } + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + SubGridQuadHitPlueckerM hit; + Occluded1EpilogMU epilog(ray,context,subgrid.geomID(),subgrid.primID()); + + /* intersect first triangle */ + if (intersectPluecker(ray,v0,v1,v3,vbool(false),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + /* intersect second triangle */ + if (intersectPluecker(ray,v2,v3,v1,vbool(true),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + if (epilog(hit.valid,hit)) + return true; + } + + return false; + } + }; + +#if defined (__AVX__) + + /*! 
Intersects 4 quads with 1 ray using AVX */ + template + struct SubGridQuadMIntersector1Pluecker<4,filter> + { + __forceinline SubGridQuadMIntersector1Pluecker() {} + + __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const + { + const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); +#if !defined(EMBREE_BACKFACE_CULLING) + const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); + const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); +#else + const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); + const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z)); +#endif + SubGridQuadHitPlueckerM<8> hit; + const vbool8 flags(0,0,0,0,1,1,1,1); + if (unlikely(intersectPluecker(ray,vtx0,vtx1,vtx2,flags,hit))) + { + /* correct U,V interpolation across the entire grid */ + interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1)); + if (unlikely(epilog(hit.valid,hit))) + return true; + } + return false; + } + + __forceinline bool intersect(RayHit& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + + __forceinline bool occluded(Ray& ray, IntersectContext* context, + const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, + const GridMesh::Grid &g, const SubGrid& subgrid) const + { + return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID())); + } + }; + +#endif + + + /* ----------------------------- */ + /* -- ray packet 
intersectors -- */ + /* ----------------------------- */ + + template + struct SubGridQuadHitPlueckerK + { + __forceinline SubGridQuadHitPlueckerK(const vfloat& U, + const vfloat& V, + const vfloat& UVW, + const vfloat& t, + const Vec3vf& Ng, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid& subgrid, + const unsigned int i) + : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + const vfloat u0 = min(U * rcpUVW,1.0f); + const vfloat v0 = min(V * rcpUVW,1.0f); + const vfloat u1 = vfloat(1.0f) - u0; + const vfloat v1 = vfloat(1.0f) - v0; + const vfloat uu = select(flags,u1,u0); + const vfloat vv = select(flags,v1,v0); + const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); + const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); + const float inv_resX = rcp((float)(int)(g.resX-1)); + const float inv_resY = rcp((float)(int)(g.resY-1)); + const vfloat u = (uu + (float)(int)sx) * inv_resX; + const vfloat v = (vv + (float)(int)sy) * inv_resY; + const Vec3vf Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat UVW; + const vfloat t; + const vfloat absDen; + const vbool flags; + const Vec3vf tri_Ng; + + const GridMesh::Grid &g; + const SubGrid& subgrid; + const size_t i; + }; + + + template + struct SubGridQuadMIntersectorKPlueckerBase + { + __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool& valid, const RayK& ray) {} + + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const Vec3vf& tri_Ng, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + /* 
calculate denominator */ + /* calculate vertices relative to ray origin */ + vbool valid = valid0; + const Vec3vf O = ray.org; + const Vec3vf D = ray.dir; + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); + const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); + const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Vec3vf(Ng),D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Vec3vf(Ng))); + const vfloat t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* calculate hit information */ + SubGridQuadHitPlueckerK hit(U,V,UVW,t,tri_Ng,flags,g,subgrid,i); + return epilog(valid,hit); + } + + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const vbool& flags, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& epilog) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + const Vec3vf Ng = cross(e2,e1); + return intersectK(valid0,ray,v0,v1,v2,Ng,flags,g,subgrid,i,epilog); + } + + template + __forceinline bool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Vec3vf& v3, + const GridMesh::Grid &g, + const SubGrid &subgrid, + const unsigned int i, + const Epilog& 
epilog) const + { + intersectK(valid0,ray,v0,v1,v3,vbool(false),g,subgrid,i,epilog); + if (none(valid0)) return true; + intersectK(valid0,ray,v2,v3,v1,vbool(true ),g,subgrid,i,epilog); + return none(valid0); + } + + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const Vec3vf& tri_Ng, + const vbool& flags, + SubGridQuadHitPlueckerM &hit) + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = broadcast>(ray.org,k); + const Vec3vf D = broadcast>(ray.dir,k); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps ; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + new (&hit) SubGridQuadHitPlueckerM(valid,U,V,UVW,t,tri_Ng,flags); + return true; + } + + static __forceinline bool intersect1(RayK& ray, + size_t k, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const vbool& flags, + SubGridQuadHitPlueckerM &hit) + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = 
v2-v0; + const Vec3vf Ng = cross(e2,e1); // FIXME: optimize!!! + return intersect1(ray,k,v0,v1,v2,Ng,flags,hit); + } + + }; + + template + struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase + { + __forceinline SubGridQuadMIntersectorKPluecker(const vbool& valid, const RayK& ray) + : SubGridQuadMIntersectorKPlueckerBase(valid,ray) {} + + __forceinline void intersect1(RayHitK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + Intersect1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + + SubGridQuadHitPlueckerM<4> hit; + if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + epilog(hit.valid,hit); + } + + if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + epilog(hit.valid,hit); + } + + } + + __forceinline bool occluded1(RayK& ray, size_t k, IntersectContext* context, + const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const Vec3vf& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const + { + Occluded1KEpilogMU epilog(ray,k,context,subgrid.geomID(),subgrid.primID()); + + SubGridQuadHitPlueckerM<4> hit; + if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v0,v1,v3,vboolf4(false),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + + if (SubGridQuadMIntersectorKPlueckerBase<4,K,filter>::intersect1(ray,k,v2,v3,v1,vboolf4(true),hit)) + { + interpolateUV(hit,g,subgrid,vint(0,1,1,0),vint(0,0,1,1)); + if (epilog(hit.valid,hit)) return true; + } + return false; + } + }; + + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h 
b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h new file mode 100644 index 00000000000..400a88b985a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h @@ -0,0 +1,236 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "subgrid_intersector.h" + +namespace embree +{ + namespace isa + { + template + struct SubGridMBIntersector1Pluecker + { + typedef SubGridMBQBVHN Primitive; + typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + float ftime; + const int itime = mesh->timeSegment(ray.time(), ftime); + + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime,ftime); + return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) + { + return PrimitivePointQuery1::pointQuery(query, context, subgrid); + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + 
BVHNQuantizedBaseNodeIntersector1 isec1; + for (size_t i=0;i dist; + const float time = prim[i].adjustTime(ray.time()); + + assert(time <= 1.0f); + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); +#if defined(__AVX__) + STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1); +#endif + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar)) continue; + intersect(pre,ray,context,prim[i].subgrid(ID)); + } + } + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + for (size_t i=0;i dist; + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery &tquery, size_t& lazy_node) + { + assert(false && "not implemented"); + return false; + } + }; + + + template + struct SubGridMBIntersectorKPluecker + { + typedef SubGridMBQBVHN Primitive; + typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations; + + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const SubGrid& subgrid) + { + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + intersect(pre,ray,ID,context,subgrid); + } + } + + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const SubGrid& subgrid) + { + vbool valid0 = valid_i; + size_t m_valid = movemask(valid_i); + while(m_valid) + { + size_t ID = bscf(m_valid); + if (occluded(pre,ray,ID,context,subgrid)) + 
clear(valid0,ID); + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(normal.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat ftime; + const vint itime = mesh->timeSegment(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const SubGrid& subgrid) + { + STAT3(shadow.trav_prims,1,1,1); + const GridMesh* mesh = context->scene->get(subgrid.geomID()); + const GridMesh::Grid &g = mesh->grid(subgrid.primID()); + + vfloat ftime; + const vint itime = mesh->timeSegment(ray.time(), ftime); + Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]); + return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid); + } + + template + static __forceinline void intersect(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + for (size_t j=0;j time = prim[j].adjustTime(ray.time()); + + vfloat dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + intersect(valid,pre,ray,context,prim[j].subgrid(i)); + } + } + } + + template + static __forceinline vbool occluded(const vbool& valid, const Accel::Intersectors* This, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersectorK isecK; + + vbool valid0 = valid; + for (size_t j=0;j time = 
prim[j].adjustTime(ray.time()); + vfloat dist; + while(m_valid) + { + const size_t i = bscf(m_valid); + if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue; + valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i)); + if (none(valid0)) break; + } + } + return !valid0; + } + + template + static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + for (size_t i=0;i dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (unlikely(dist[ID] > ray.tfar[k])) continue; + intersect(pre,ray,k,context,prim[i].subgrid(ID)); + } + } + } + + template + static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay &tray, size_t& lazy_node) + { + BVHNQuantizedBaseNodeIntersector1 isec1; + + for (size_t i=0;i dist; + const float time = prim[i].adjustTime(ray.time()[k]); + assert(time <= 1.0f); + + size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); + while(mask != 0) + { + const size_t ID = bscf(mask); + if (occluded(pre,ray,k,context,prim[i].subgrid(ID))) + return true; + } + } + return false; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle.h b/thirdparty/embree-aarch64/kernels/geometry/triangle.h new file mode 100644 index 00000000000..0dedf6dc4c1 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle.h @@ -0,0 +1,162 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Precalculated representation for M 
triangles. Stores for each + triangle a base vertex, two edges, and the geometry normal to + speed up intersection calculations */ + template + struct TriangleM + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleM() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleM(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const vuint& geomIDs, const vuint& primIDs) + : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool valid() const { return geomIDs != vuint(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } + __forceinline const vuint& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } + __forceinline const vuint& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i p0 = v0; + Vec3vf p1 = v0-e1; + Vec3vf p2 = v0+e2; + Vec3vf lower = min(p0,p1,p2); + Vec3vf upper = max(p0,p1,p2); + vbool mask = valid(); + lower.x = select(mask,lower.x,vfloat(pos_inf)); + lower.y = select(mask,lower.y,vfloat(pos_inf)); + lower.z = select(mask,lower.z,vfloat(pos_inf)); + upper.x = select(mask,upper.x,vfloat(neg_inf)); + upper.y = select(mask,upper.y,vfloat(neg_inf)); + upper.z = 
select(mask,upper.z,vfloat(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleM* dst, const TriangleM& src) + { + vfloat::store_nt(&dst->v0.x,src.v0.x); + vfloat::store_nt(&dst->v0.y,src.v0.y); + vfloat::store_nt(&dst->v0.z,src.v0.z); + vfloat::store_nt(&dst->e1.x,src.e1.x); + vfloat::store_nt(&dst->e1.y,src.e1.y); + vfloat::store_nt(&dst->e1.z,src.e1.z); + vfloat::store_nt(&dst->e2.x,src.e2.x); + vfloat::store_nt(&dst->e2.y,src.e2.y); + vfloat::store_nt(&dst->e2.z,src.e2.z); + vuint::store_nt(&dst->geomIDs,src.geomIDs); + vuint::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; iget(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; itriangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; 
v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID)); + return bounds; + } + + public: + Vec3vf v0; // base vertex of the triangles + Vec3vf e1; // 1st edge of the triangles (v0-v1) + Vec3vf e2; // 2nd edge of the triangles (v2-v0) + private: + vuint geomIDs; // geometry IDs + vuint primIDs; // primitive IDs + }; + + template + typename TriangleM::Type TriangleM::type; + + typedef TriangleM<4> Triangle4; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h new file mode 100644 index 00000000000..125a42c5fee --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h @@ -0,0 +1,96 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_moeller.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template + struct TriangleMIntersector1Moeller + { + typedef TriangleM Primitive; + typedef MoellerTrumboreIntersector1 Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. 
*/ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + + }; + + /*! Intersects M triangles with K rays. */ + template + struct TriangleMIntersectorKMoeller + { + typedef TriangleM Primitive; + typedef MoellerTrumboreIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleM& tri) + { + STAT_USER(0,TriangleM::max_size()); + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf p0 = broadcast>(tri.v0,i); + const Vec3vf e1 = broadcast>(tri.e1,i); + const Vec3vf e2 = broadcast>(tri.e2,i); + pre.intersectEdgeK(valid_i,ray,p0,e1,e2,IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleM& tri) + { + vbool valid0 = valid_i; + + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf p0 = broadcast>(tri.v0,i); + const Vec3vf e1 = broadcast>(tri.e1,i); + const Vec3vf e2 = broadcast>(tri.e2,i); + pre.intersectEdgeK(valid0,ray,p0,e1,e2,OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. 
*/ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleM& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleM& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h new file mode 100644 index 00000000000..b5a8519236d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h @@ -0,0 +1,403 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +/*! This intersector implements a modified version of the Moeller + * Trumbore intersector from the paper "Fast, Minimum Storage + * Ray-Triangle Intersection". In contrast to the paper we + * precalculate some factors and factor the calculations differently + * to allow precalculating the cross product e1 x e2. The resulting + * algorithm is similar to the fastest one of the paper "Optimizing + * Ray-Triangle Intersection via Automated Search". 
*/ + +namespace embree +{ + namespace isa + { + template + struct MoellerTrumboreHitM + { + __forceinline MoellerTrumboreHitM() {} + + __forceinline MoellerTrumboreHitM(const vbool& valid, const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) + : U(U), V(V), T(T), absDen(absDen), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + const vfloat rcpAbsDen = rcp(absDen); + vt = T * rcpAbsDen; + vu = U * rcpAbsDen; + vv = V * rcpAbsDen; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + public: + vfloat U; + vfloat V; + vfloat T; + vfloat absDen; + + public: + vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct MoellerTrumboreIntersector1 + { + __forceinline MoellerTrumboreIntersector1() {} + + __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {} + + __forceinline bool intersect(const vbool& valid0, + Ray& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + MoellerTrumboreHitM& hit) const + { + /* calculate denominator */ + vbool valid = valid0; + const Vec3vf O = Vec3vf((Vec3fa)ray.org); + const Vec3vf D = Vec3vf((Vec3fa)ray.dir); + const Vec3vf C = Vec3vf(tri_v0) - O; + const Vec3vf R = cross(C,D); + const vfloat den = dot(Vec3vf(tri_Ng),D); + + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat U = dot(R,Vec3vf(tri_e2)) ^ sgnDen; + const vfloat V = dot(R,Vec3vf(tri_e1)) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + valid &= (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) 
return false; + + /* perform depth test */ + const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat(ray.tnear()) < T) & (T <= absDen*vfloat(ray.tfar)); + if (likely(none(valid))) return false; + + + /* update hit information */ + new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); + + return true; + } + + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + MoellerTrumboreHitM& hit) const + { + vbool valid = true; + const Vec3> tri_Ng = cross(tri_e2,tri_e1); + return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,hit); + } + + __forceinline bool intersect(Ray& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + MoellerTrumboreHitM& hit) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(ray,v0,e1,e2,hit); + } + + __forceinline bool intersect(const vbool& valid, + Ray& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + MoellerTrumboreHitM& hit) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(valid,ray,v0,e1,e2,hit); + } + + template + __forceinline bool intersectEdge(Ray& ray, + const Vec3vf& v0, + const Vec3vf& e1, + const Vec3vf& e2, + const Epilog& epilog) const + { + MoellerTrumboreHitM hit; + if (likely(intersectEdge(ray,v0,e1,e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersect(Ray& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + MoellerTrumboreHitM hit; + if (likely(intersect(ray,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersect(const vbool& valid, + Ray& ray, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + MoellerTrumboreHitM hit; + if (likely(intersect(valid,ray,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + }; + + template + struct 
MoellerTrumboreHitK + { + __forceinline MoellerTrumboreHitK(const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vfloat rcpAbsDen = rcp(absDen); + const vfloat t = T * rcpAbsDen; + const vfloat u = U * rcpAbsDen; + const vfloat v = V * rcpAbsDen; + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat T; + const vfloat absDen; + const Vec3vf Ng; + }; + + template + struct MoellerTrumboreIntersectorK + { + __forceinline MoellerTrumboreIntersectorK(const vbool& valid, const RayK& ray) {} + + /*! Intersects K rays with one of M triangles. */ + template + __forceinline vbool intersectK(const vbool& valid0, + //RayK& ray, + const Vec3vf& ray_org, + const Vec3vf& ray_dir, + const vfloat& ray_tnear, + const vfloat& ray_tfar, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool valid = valid0; + const Vec3vf C = tri_v0 - ray_org; + const Vec3vf R = cross(C,ray_dir); + const vfloat den = dot(tri_Ng,ray_dir); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < 
vfloat(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + MoellerTrumboreHitK hit(U,V,T,absDen,tri_Ng); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const Epilog& epilog) const + { + const Vec3vf e1 = tri_v0-tri_v1; + const Vec3vf e2 = tri_v2-tri_v0; + const Vec3vf Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); + } + + /*! Intersects K rays with one of M triangles. */ + template + __forceinline vbool intersectEdgeK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + const Vec3vf tri_Ng = cross(tri_e2,tri_e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + MoellerTrumboreHitM& hit) const + { + /* calculate denominator */ + typedef Vec3vf Vec3vfM; + const Vec3vf tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast>(ray.org,k); + const Vec3vfM D = broadcast>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat den = dot(Vec3vfM(tri_Ng),D); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat U = dot(Vec3vf(tri_e2),R) ^ sgnDen; + const vfloat V = dot(Vec3vf(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) MoellerTrumboreHitM(valid,U,V,T,absDen,tri_Ng); + return true; + } + + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + MoellerTrumboreHitM& hit) const + { + if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) + { + hit.valid &= time_range.lower <= vfloat(ray.time[k]); + hit.valid &= vfloat(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + MoellerTrumboreHitM hit; + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return 
epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + MoellerTrumboreHitM hit; + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersect(RayK& ray, + size_t k, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,epilog); + } + + template + __forceinline bool intersect(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h new file mode 100644 index 00000000000..f1de99d2087 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h @@ -0,0 +1,247 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "trianglev.h" +#include "trianglev_mb.h" +#include "intersector_epilog.h" + +/*! Modified Pluecker ray/triangle intersector. The test first shifts + * the ray origin into the origin of the coordinate system and then + * uses Pluecker coordinates for the intersection. Due to the shift, + * the Pluecker coordinate calculation simplifies and the tests get + * numerically stable. The edge equations are watertight along the + * edge for neighboring triangles. 
*/ + +namespace embree +{ + namespace isa + { + template + struct PlueckerHitM + { + __forceinline PlueckerHitM(const vfloat& U, const vfloat& V, const vfloat& UVW, const vfloat& t, const Vec3vf& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), mapUV(mapUV), vt(t), vNg(Ng) {} + + __forceinline void finalize() + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + vu = U * rcpUVW; + vv = V * rcpUVW; + mapUV(vu,vv); + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + const vfloat U; + const vfloat V; + const vfloat UVW; + const UVMapper& mapUV; + + public: + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct PlueckerIntersector1 + { + __forceinline PlueckerIntersector1() {} + + __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {} + + template + __forceinline bool intersect(Ray& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = Vec3vf((Vec3fa)ray.org); + const Vec3vf D = Vec3vf((Vec3fa)ray.dir); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return 
false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()) <= t & t <= vfloat(ray.tfar); + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + PlueckerHitM hit(U,V,UVW,t,Ng,mapUV); + return epilog(valid,hit); + } + }; + + template + struct PlueckerHitK + { + __forceinline PlueckerHitK(const vfloat& U, const vfloat& V, const vfloat& UVW, const vfloat& t, const Vec3vf& Ng, const UVMapper& mapUV) + : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vbool invalid = abs(UVW) < min_rcp_input; + const vfloat rcpUVW = select(invalid,vfloat(0.0f),rcp(UVW)); + vfloat u = U * rcpUVW; + vfloat v = V * rcpUVW; + mapUV(u,v); + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat UVW; + const vfloat t; + const Vec3vf Ng; + const UVMapper& mapUV; + }; + + template + struct PlueckerIntersectorK + { + __forceinline PlueckerIntersectorK(const vbool& valid, const RayK& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + vbool valid = valid0; + const Vec3vf O = ray.org; + const Vec3vf D = ray.dir; + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(Vec3vf(cross(e0,v2+v0)),D); + const vfloat V = dot(Vec3vf(cross(e1,v0+v1)),D); + const vfloat W = dot(Vec3vf(cross(e2,v1+v2)),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + valid &= max(U,V,W) <= eps; +#else + valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Vec3vf(Ng),D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Vec3vf(Ng))); + const vfloat t = rcp(den)*T; + valid &= ray.tnear() <= t & t <= ray.tfar; + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* calculate hit information */ + PlueckerHitK hit(U,V,UVW,t,Ng,mapUV); + return epilog(valid,hit); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. 
*/ + template + __forceinline bool intersect(RayK& ray, size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const UVMapper& mapUV, + const Epilog& epilog) const + { + /* calculate vertices relative to ray origin */ + const Vec3vf O = broadcast>(ray.org,k); + const Vec3vf D = broadcast>(ray.dir,k); + const Vec3vf v0 = tri_v0-O; + const Vec3vf v1 = tri_v1-O; + const Vec3vf v2 = tri_v2-O; + + /* calculate triangle edges */ + const Vec3vf e0 = v2-v0; + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v1-v2; + + /* perform edge tests */ + const vfloat U = dot(cross(e0,v2+v0),D); + const vfloat V = dot(cross(e1,v0+v1),D); + const vfloat W = dot(cross(e2,v1+v2),D); + const vfloat UVW = U+V+W; + const vfloat eps = float(ulp)*abs(UVW); +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = max(U,V,W) <= eps; +#else + vbool valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); +#endif + if (unlikely(none(valid))) return false; + + /* calculate geometry normal and denominator */ + const Vec3vf Ng = stable_triangle_normal(e0,e1,e2); + const vfloat den = twice(dot(Ng,D)); + + /* perform depth test */ + const vfloat T = twice(dot(v0,Ng)); + const vfloat t = rcp(den)*T; + valid &= vfloat(ray.tnear()[k]) <= t & t <= vfloat(ray.tfar[k]); + if (unlikely(none(valid))) return false; + + /* avoid division by 0 */ + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; + + /* update hit information */ + PlueckerHitM hit(U,V,UVW,t,Ng,mapUV); + return epilog(valid,hit); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h new file mode 100644 index 00000000000..63e649d8fb6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h @@ -0,0 +1,418 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include 
"intersector_epilog.h" + +/*! This intersector implements a modified version of the Woop's ray-triangle intersection test */ + +namespace embree +{ + namespace isa + { + template + struct WoopHitM + { + __forceinline WoopHitM() {} + + __forceinline WoopHitM(const vbool& valid, + const vfloat& U, + const vfloat& V, + const vfloat& T, + const vfloat& inv_det, + const Vec3vf& Ng) + : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {} + + __forceinline void finalize() + { + vt = T; + vu = U*inv_det; + vv = V*inv_det; + } + + __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } + __forceinline float t (const size_t i) const { return vt[i]; } + __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } + + private: + const vfloat U; + const vfloat V; + const vfloat T; + const vfloat inv_det; + + public: + const vbool valid; + vfloat vu; + vfloat vv; + vfloat vt; + Vec3vf vNg; + }; + + template + struct WoopPrecalculations1 + { + unsigned int kx,ky,kz; + Vec3vf org; + Vec3fa S; + __forceinline WoopPrecalculations1() {} + + __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr) + { + kz = maxDim(abs(ray.dir)); + kx = (kz+1) % 3; + ky = (kx+1) % 3; + const float inv_dir_kz = rcp(ray.dir[kz]); + if (ray.dir[kz]) std::swap(kx,ky); + S.x = ray.dir[kx] * inv_dir_kz; + S.y = ray.dir[ky] * inv_dir_kz; + S.z = inv_dir_kz; + org = Vec3vf(ray.org[kx],ray.org[ky],ray.org[kz]); + } + }; + + + template + struct WoopIntersector1 + { + + typedef WoopPrecalculations1 Precalculations; + + __forceinline WoopIntersector1() {} + + __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {} + + static __forceinline bool intersect(const vbool& valid0, + Ray& ray, + const Precalculations& pre, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + WoopHitM& hit) + { + vbool valid = valid0; + + /* vertices relative to ray origin */ + const Vec3vf org = Vec3vf(pre.org.x,pre.org.y,pre.org.z); 
+ const Vec3vf A = Vec3vf(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org; + const Vec3vf B = Vec3vf(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org; + const Vec3vf C = Vec3vf(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org; + + /* shear and scale vertices */ + const vfloat Ax = nmadd(A.z,pre.S.x,A.x); + const vfloat Ay = nmadd(A.z,pre.S.y,A.y); + const vfloat Bx = nmadd(B.z,pre.S.x,B.x); + const vfloat By = nmadd(B.z,pre.S.y,B.y); + const vfloat Cx = nmadd(C.z,pre.S.x,C.x); + const vfloat Cy = nmadd(C.z,pre.S.y,C.y); + + /* scaled barycentric */ + const vfloat U0 = Cx*By; + const vfloat U1 = Cy*Bx; + const vfloat V0 = Ax*Cy; + const vfloat V1 = Ay*Cx; + const vfloat W0 = Bx*Ay; + const vfloat W1 = By*Ax; +#if !defined(__AVX512F__) + valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) | + (U0 <= U1) & (V0 <= V1) & (W0 <= W1); +#else + valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1); +#endif + + if (likely(none(valid))) return false; + const vfloat U = U0-U1; + const vfloat V = V0-V1; + const vfloat W = W0-W1; + + const vfloat det = U+V+W; + + valid &= det != 0.0f; + const vfloat inv_det = rcp(det); + + const vfloat Az = pre.S.z * A.z; + const vfloat Bz = pre.S.z * B.z; + const vfloat Cz = pre.S.z * C.z; + const vfloat T = madd(U,Az,madd(V,Bz,W*Cz)); + const vfloat t = T * inv_det; + /* perform depth test */ + valid &= (vfloat(ray.tnear()) < t) & (t <= vfloat(ray.tfar)); + if (likely(none(valid))) return false; + + const Vec3vf tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1); + + /* update hit information */ + new (&hit) WoopHitM(valid,U,V,t,inv_det,tri_Ng); + return true; + } + + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + WoopHitM& hit) + { + vbool valid = true; + return intersect(valid,ray,pre,v0,v1,v2,hit); + } + + + template + static __forceinline bool intersect(Ray& ray, + const Precalculations& pre, + const Vec3vf& v0, + const Vec3vf& v1, + 
const Vec3vf& v2, + const Epilog& epilog) + { + WoopHitM hit; + if (likely(intersect(ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + static __forceinline bool intersect(const vbool& valid, + Ray& ray, + const Precalculations& pre, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) + { + WoopHitM hit; + if (likely(intersect(valid,ray,pre,v0,v1,v2,hit))) return epilog(hit.valid,hit); + return false; + } + }; + +#if 0 + template + struct WoopHitK + { + __forceinline WoopHitK(const vfloat& U, const vfloat& V, const vfloat& T, const vfloat& absDen, const Vec3vf& Ng) + : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {} + + __forceinline std::tuple,vfloat,vfloat,Vec3vf> operator() () const + { + const vfloat rcpAbsDen = rcp(absDen); + const vfloat t = T * rcpAbsDen; + const vfloat u = U * rcpAbsDen; + const vfloat v = V * rcpAbsDen; + return std::make_tuple(u,v,t,Ng); + } + + private: + const vfloat U; + const vfloat V; + const vfloat T; + const vfloat absDen; + const Vec3vf Ng; + }; + + template + struct WoopIntersectorK + { + __forceinline WoopIntersectorK(const vbool& valid, const RayK& ray) {} + + /*! Intersects K rays with one of M triangles. 
*/ + template + __forceinline vbool intersectK(const vbool& valid0, + //RayK& ray, + const Vec3vf& ray_org, + const Vec3vf& ray_dir, + const vfloat& ray_tnear, + const vfloat& ray_tfar, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Vec3vf& tri_Ng, + const Epilog& epilog) const + { + /* calculate denominator */ + vbool valid = valid0; + const Vec3vf C = tri_v0 - ray_org; + const Vec3vf R = cross(C,ray_dir); + const vfloat den = dot(tri_Ng,ray_dir); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* test against edge p2 p0 */ + const vfloat U = dot(tri_e2,R) ^ sgnDen; + valid &= U >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p0 p1 */ + const vfloat V = dot(tri_e1,R) ^ sgnDen; + valid &= V >= 0.0f; + if (likely(none(valid))) return false; + + /* test against edge p1 p2 */ + const vfloat W = absDen-U-V; + valid &= W >= 0.0f; + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(tri_Ng,C) ^ sgnDen; + valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar); + if (unlikely(none(valid))) return false; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + valid &= den < vfloat(zero); + if (unlikely(none(valid))) return false; +#else + valid &= den != vfloat(zero); + if (unlikely(none(valid))) return false; +#endif + + /* calculate hit information */ + WoopHitK hit(U,V,T,absDen,tri_Ng); + return epilog(valid,hit); + } + + /*! Intersects K rays with one of M triangles. */ + template + __forceinline vbool intersectK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_v1, + const Vec3vf& tri_v2, + const Epilog& epilog) const + { + const Vec3vf e1 = tri_v0-tri_v1; + const Vec3vf e2 = tri_v2-tri_v0; + const Vec3vf Ng = cross(e2,e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog); + } + + /*! Intersects K rays with one of M triangles. 
*/ + template + __forceinline vbool intersectEdgeK(const vbool& valid0, + RayK& ray, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + const Vec3vf tri_Ng = cross(tri_e2,tri_e1); + return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog); + } + + /*! Intersect k'th ray from ray packet of size K with M triangles. */ + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + WoopHitM& hit) const + { + /* calculate denominator */ + typedef Vec3vf Vec3vfM; + const Vec3vf tri_Ng = cross(tri_e2,tri_e1); + + const Vec3vfM O = broadcast>(ray.org,k); + const Vec3vfM D = broadcast>(ray.dir,k); + const Vec3vfM C = Vec3vfM(tri_v0) - O; + const Vec3vfM R = cross(C,D); + const vfloat den = dot(Vec3vfM(tri_Ng),D); + const vfloat absDen = abs(den); + const vfloat sgnDen = signmsk(den); + + /* perform edge tests */ + const vfloat U = dot(Vec3vf(tri_e2),R) ^ sgnDen; + const vfloat V = dot(Vec3vf(tri_e1),R) ^ sgnDen; + + /* perform backface culling */ +#if defined(EMBREE_BACKFACE_CULLING) + vbool valid = (den < vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#else + vbool valid = (den != vfloat(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen); +#endif + if (likely(none(valid))) return false; + + /* perform depth test */ + const vfloat T = dot(Vec3vf(tri_Ng),C) ^ sgnDen; + valid &= (absDen*vfloat(ray.tnear()[k]) < T) & (T <= absDen*vfloat(ray.tfar[k])); + if (likely(none(valid))) return false; + + /* calculate hit information */ + new (&hit) WoopHitM(valid,U,V,T,absDen,tri_Ng); + return true; + } + + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + WoopHitM& hit) const + { + if (likely(intersect(ray,k,tri_v0,tri_e1,tri_e2,hit))) + { + hit.valid &= time_range.lower <= vfloat(ray.time[k]); + 
hit.valid &= vfloat(ray.time[k]) < time_range.upper; + return any(hit.valid); + } + return false; + } + + template + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + WoopHitM hit; + if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersectEdge(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& tri_v0, + const Vec3vf& tri_e1, + const Vec3vf& tri_e2, + const Epilog& epilog) const + { + WoopHitM hit; + if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit); + return false; + } + + template + __forceinline bool intersect(RayK& ray, + size_t k, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(ray,k,v0,e1,e2,epilog); + } + + template + __forceinline bool intersect(RayK& ray, + size_t k, + const BBox>& time_range, + const Vec3vf& v0, + const Vec3vf& v1, + const Vec3vf& v2, + const Epilog& epilog) const + { + const Vec3vf e1 = v0-v1; + const Vec3vf e2 = v2-v0; + return intersectEdge(ray,k,time_range,v0,e1,e2,epilog); + } + }; +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h new file mode 100644 index 00000000000..91b35c36f39 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h @@ -0,0 +1,132 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primitive.h" + +namespace embree +{ + namespace isa + { + struct TriangleTriangleIntersector + { + __forceinline static float T(float pa0, float pa1, float da0, float da1) { + return pa0 + (pa1-pa0)*da0/(da0-da1); + } + + __forceinline 
static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { + return det(p-a0,a0-a1) >= 0.0f; + } + + __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) + { + const bool pab = point_line_side(p,a,b); + const bool pbc = point_line_side(p,b,c); + const bool pca = point_line_side(p,c,a); + return pab == pbc && pab == pca; + } + + __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1) + { + const bool different_sides0 = point_line_side(b0,a0,a1) != point_line_side(b1,a0,a1); + const bool different_sides1 = point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1); + return different_sides0 && different_sides1; + } + + __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, + const Vec2f& b0, const Vec2f& b1, const Vec2f& b2) + { + const bool a01_b01 = intersect_line_line(a0,a1,b0,b1); + if (a01_b01) return true; + const bool a01_b12 = intersect_line_line(a0,a1,b1,b2); + if (a01_b12) return true; + const bool a01_b20 = intersect_line_line(a0,a1,b2,b0); + if (a01_b20) return true; + const bool a12_b01 = intersect_line_line(a1,a2,b0,b1); + if (a12_b01) return true; + const bool a12_b12 = intersect_line_line(a1,a2,b1,b2); + if (a12_b12) return true; + const bool a12_b20 = intersect_line_line(a1,a2,b2,b0); + if (a12_b20) return true; + const bool a20_b01 = intersect_line_line(a2,a0,b0,b1); + if (a20_b01) return true; + const bool a20_b12 = intersect_line_line(a2,a0,b1,b2); + if (a20_b12) return true; + const bool a20_b20 = intersect_line_line(a2,a0,b2,b0); + if (a20_b20) return true; + + bool a_in_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2); + if (a_in_b) return true; + + bool b_in_a = point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2); + if (b_in_a) return 
true; + + return false; + } + + static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2, + const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2) + { + const float eps = 1E-5f; + + /* calculate triangle planes */ + const Vec3fa Na = cross(a1-a0,a2-a0); + const float Ca = dot(Na,a0); + const Vec3fa Nb = cross(b1-b0,b2-b0); + const float Cb = dot(Nb,b0); + + /* project triangle A onto plane B */ + const float da0 = dot(Nb,a0)-Cb; + const float da1 = dot(Nb,a1)-Cb; + const float da2 = dot(Nb,a2)-Cb; + if (max(da0,da1,da2) < -eps) return false; + if (min(da0,da1,da2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections4++); + + /* project triangle B onto plane A */ + const float db0 = dot(Na,b0)-Ca; + const float db1 = dot(Na,b1)-Ca; + const float db2 = dot(Na,b2)-Ca; + if (max(db0,db1,db2) < -eps) return false; + if (min(db0,db1,db2) > +eps) return false; + //CSTAT(bvh_collide_prim_intersections5++); + + if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) || + (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps))) + { + const size_t dz = maxDim(Na); + const size_t dx = (dz+1)%3; + const size_t dy = (dx+1)%3; + const Vec2f A0(a0[dx],a0[dy]); + const Vec2f A1(a1[dx],a1[dy]); + const Vec2f A2(a2[dx],a2[dy]); + const Vec2f B0(b0[dx],b0[dy]); + const Vec2f B1(b1[dx],b1[dy]); + const Vec2f B2(b2[dx],b2[dy]); + return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2); + } + + const Vec3fa D = cross(Na,Nb); + const float pa0 = dot(D,a0); + const float pa1 = dot(D,a1); + const float pa2 = dot(D,a2); + const float pb0 = dot(D,b0); + const float pb1 = dot(D,b1); + const float pb2 = dot(D,b2); + + BBox1f ba = empty; + if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1)); + if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2)); + if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && 
abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0)); + + BBox1f bb = empty; + if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1)); + if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2)); + if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0)); + + return conjoint(ba,bb); + } + }; + } +} + + diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h new file mode 100644 index 00000000000..4f3118cc0cc --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h @@ -0,0 +1,442 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" +#include "../common/scene.h" + +namespace embree +{ + /* Stores M triangles from an indexed face set */ + template + struct TriangleMi + { + /* Virtual interface to query information about the triangle type */ + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* primitive supports multiple time segments */ + static const bool singleTimeSegment = false; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMi() { } + + /* Construction from vertices and IDs */ + __forceinline TriangleMi(const vuint& v0, + const vuint& v1, + const vuint& v2, + const vuint& geomIDs, + const vuint& primIDs) +#if defined(EMBREE_COMPACT_POLYS) + : geomIDs(geomIDs), primIDs(primIDs) {} +#else + : v0_(v0), 
v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {} +#endif + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool valid() const { return primIDs != vuint(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(iget(geomID(i)); + bounds.extend(mesh->bounds(primID(i),itime)); + } + return bounds; + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) { + return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1)); + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) + { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), itime, numTimeSteps)); + } + return allBounds; + } + + __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) + { + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID(i)); + allBounds.extend(mesh->linearBounds(primID(i), time_range)); + } + return allBounds; + } + + /* Non-temporal store */ + __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src) + { +#if !defined(EMBREE_COMPACT_POLYS) + vuint::store_nt(&dst->v0_,src.v0_); + vuint::store_nt(&dst->v1_,src.v1_); + vuint::store_nt(&dst->v2_,src.v2_); +#endif + vuint::store_nt(&dst->geomIDs,src.geomIDs); + vuint::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + template + __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene) + { + vuint v0 = zero, v1 = zero, v2 = zero; + vuint geomID = -1, primID = -1; + const PrimRefT* prim = &prims[begin]; + + for (size_t i=0; igeomID(); + 
primID[i] = prim->primID(); +#if !defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get(prim->geomID()); + const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID()); + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; +#endif + begin++; + } else { + assert(i); + if (likely(i > 0)) { + geomID[i] = geomID[0]; + primID[i] = -1; + v0[i] = v0[0]; + v1[i] = v0[0]; + v2[i] = v0[0]; + } + } + if (begintriangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + } + return bounds; + } + + protected: +#if !defined(EMBREE_COMPACT_POLYS) + vuint v0_; // 4 byte offset of 1st vertex + vuint v1_; // 4 byte offset of 2nd vertex + vuint v2_; // 4 byte offset of 3rd vertex +#endif + vuint geomIDs; // geometry ID of mesh + vuint primIDs; // primitive ID of primitive inside mesh + }; + + namespace isa + { + + template + struct TriangleMi : public embree::TriangleMi + { +#if !defined(EMBREE_COMPACT_POLYS) + using embree::TriangleMi::v0_; + using embree::TriangleMi::v1_; + using embree::TriangleMi::v2_; +#endif + using embree::TriangleMi::geomIDs; + using embree::TriangleMi::primIDs; + using embree::TriangleMi::geomID; + using embree::TriangleMi::primID; + using embree::TriangleMi::valid; + + /* loads a single vertex */ + template + __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + return (Vec3f) mesh->vertices[0][tri.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const float* vertices = scene->vertices[geomID(index)]; + return (Vec3f&) vertices[v[index]]; +#endif + } + + template + __forceinline Vec3 
getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh* mesh = scene->get(geomID(index)); + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const TriangleMesh* mesh = scene->get(geomID(index)); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + const Vec3 p0(v0.x,v0.y,v0.z); + const Vec3 p1(v1.x,v1.y,v1.z); + return lerp(p0,p1,ftime); + } + + template + __forceinline Vec3 getVertex(const vbool& valid, const size_t index, const Scene *const scene, const vint& itime, const T& ftime) const + { + Vec3 p0, p1; + const TriangleMesh* mesh = scene->get(geomID(index)); + + for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) + { +#if defined(EMBREE_COMPACT_POLYS) + const TriangleMesh::Triangle& tri = mesh->triangle(primID(index)); + const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]]; + const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]]; +#else + const vuint& v = getVertexOffset(); + const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0); + const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1); + const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]); + const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]); +#endif + p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; + p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; + } + return (T(one)-ftime)*p0 + ftime*p1; + } + + struct Triangle { + vfloat4 v0,v1,v2; + }; + +#if defined(EMBREE_COMPACT_POLYS) + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+ { + const unsigned int geomID = geomIDs[i]; + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero }; + const TriangleMesh* mesh = scene->get(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]]; + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const unsigned int primID = primIDs[i]; + if (unlikely(primID == -1)) return { zero, zero, zero }; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]]; + const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]]; + const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]]; + return { v0, v1, v2 }; + } + +#else + + __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const + { + const float* vertices = scene->vertices[geomID(i)]; + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + + __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const + { + const float* vertices = (const float*) mesh->vertexPtr(0,itime); + const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]); + const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]); + const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]); + return { v0, v1, v2 }; + } + +#endif + + /* Gather the triangles */ + __forceinline void gather(Vec3vf& p0, Vec3vf& p1, Vec3vf& p2, const Scene* const scene) const; + + template +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019 + __noinline +#else + __forceinline +#endif + void gather(const 
vbool& valid, + Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + const size_t index, + const Scene* const scene, + const vfloat& time) const + { + const TriangleMesh* mesh = scene->get(geomID(index)); + + vfloat ftime; + const vint itime = mesh->timeSegment(time, ftime); + + const size_t first = bsf(movemask(valid)); + if (likely(all(valid,itime[first] == itime))) + { + p0 = getVertex<0>(index, scene, itime[first], ftime); + p1 = getVertex<1>(index, scene, itime[first], ftime); + p2 = getVertex<2>(index, scene, itime[first], ftime); + } else { + p0 = getVertex<0>(valid, index, scene, itime, ftime); + p1 = getVertex<1>(valid, index, scene, itime, ftime); + p2 = getVertex<2>(valid, index, scene, itime, ftime); + } + } + + __forceinline void gather(Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const; + + __forceinline void gather(Vec3vf& p0, + Vec3vf& p1, + Vec3vf& p2, + const Scene *const scene, + const float time) const; + + +#if !defined(EMBREE_COMPACT_POLYS) + template const vuint& getVertexOffset() const; +#endif + }; + +#if !defined(EMBREE_COMPACT_POLYS) + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } + template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } +#endif + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const Scene* const scene) const + { + const Triangle tri0 = loadTriangle(0,scene); + const Triangle tri1 = loadTriangle(1,scene); + const Triangle tri2 = loadTriangle(2,scene); + const Triangle tri3 = loadTriangle(3,scene); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + 
template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const TriangleMesh* mesh, + const Scene *const scene, + const int itime) const + { + const Triangle tri0 = loadTriangle(0,itime,mesh); + const Triangle tri1 = loadTriangle(1,itime,mesh); + const Triangle tri2 = loadTriangle(2,itime,mesh); + const Triangle tri3 = loadTriangle(3,itime,mesh); + transpose(tri0.v0,tri1.v0,tri2.v0,tri3.v0,p0.x,p0.y,p0.z); + transpose(tri0.v1,tri1.v1,tri2.v1,tri3.v1,p1.x,p1.y,p1.z); + transpose(tri0.v2,tri1.v2,tri2.v2,tri3.v2,p2.x,p2.y,p2.z); + } + + template<> + __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, + Vec3vf4& p1, + Vec3vf4& p2, + const Scene *const scene, + const float time) const + { + const TriangleMesh* mesh = scene->get(geomID(0)); // in mblur mode all geometries are identical + + float ftime; + const int itime = mesh->timeSegment(time, ftime); + + Vec3vf4 a0,a1,a2; gather(a0,a1,a2,mesh,scene,itime); + Vec3vf4 b0,b1,b2; gather(b0,b1,b2,mesh,scene,itime+1); + p0 = lerp(a0,b0,vfloat4(ftime)); + p1 = lerp(a1,b1,vfloat4(ftime)); + p2 = lerp(a2,b2,vfloat4(ftime)); + } + } + + template + typename TriangleMi::Type TriangleMi::type; + + typedef TriangleMi<4> Triangle4i; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h new file mode 100644 index 00000000000..e2f106a62cd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h @@ -0,0 +1,336 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "trianglei.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_pluecker.h" + +namespace embree +{ + namespace isa + { + /*! 
Intersects M triangles with 1 ray */ + template + struct TriangleMiIntersector1Moeller + { + typedef TriangleMi Primitive; + typedef MoellerTrumboreIntersector1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! 
Intersects M triangles with K rays */ + template + struct TriangleMiIntersectorKMoeller + { + typedef TriangleMi Primitive; + typedef MoellerTrumboreIntersectorK Precalculations; + + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i::size()); + const Vec3vf v0 = tri.template getVertex<0>(i,scene); + const Vec3vf v1 = tri.template getVertex<1>(i,scene); + const Vec3vf v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) + { + vbool valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i::size()); + const Vec3vf v0 = tri.template getVertex<0>(i,scene); + const Vec3vf v1 = tri.template getVertex<1>(i,scene); + const Vec3vf v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! 
Intersects M triangles with 1 ray */ + template + struct TriangleMiIntersector1Pluecker + { + typedef TriangleMi Primitive; + typedef PlueckerIntersector1 Precalculations; + + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! 
Intersects M triangles with K rays */ + template + struct TriangleMiIntersectorKPluecker + { + typedef TriangleMi Primitive; + typedef PlueckerIntersectorK Precalculations; + + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) + { + const Scene* scene = context->scene; + for (size_t i=0; i::size()); + const Vec3vf v0 = tri.template getVertex<0>(i,scene); + const Vec3vf v1 = tri.template getVertex<1>(i,scene); + const Vec3vf v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) + { + vbool valid0 = valid_i; + const Scene* scene = context->scene; + + for (size_t i=0; i::size()); + const Vec3vf v0 = tri.template getVertex<0>(i,scene); + const Vec3vf v1 = tri.template getVertex<1>(i,scene); + const Vec3vf v2 = tri.template getVertex<2>(i,scene); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0, v1, v2; tri.gather(v0,v1,v2,context->scene); + return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! 
Intersects M motion blur triangles with 1 ray */ + template + struct TriangleMiMBIntersector1Moeller + { + typedef TriangleMi Primitive; + typedef MoellerTrumboreIntersector1 Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template + struct TriangleMiMBIntersectorKMoeller + { + typedef TriangleMi Primitive; + typedef MoellerTrumboreIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMi& tri) + { + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. 
*/ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMi& tri) + { + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! Intersects M motion blur triangles with 1 ray */ + template + struct TriangleMiMBIntersector1Pluecker + { + typedef TriangleMi Primitive; + typedef PlueckerIntersector1 Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. 
*/ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()); + return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template + struct TriangleMiMBIntersectorKPluecker + { + typedef TriangleMi Primitive; + typedef PlueckerIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMi& tri) + { + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. 
*/ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMi& tri) + { + vbool valid0 = valid_i; + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + Vec3vf v0,v1,v2; tri.gather(valid_i,v0,v1,v2,i,context->scene,ray.time()); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) + { + STAT3(normal.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. 
*/ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMi& tri) + { + STAT3(shadow.trav_prims,1,1,1); + Vec3vf v0,v1,v2; tri.gather(v0,v1,v2,context->scene,ray.time()[k]); + return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h new file mode 100644 index 00000000000..19af389e73e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h @@ -0,0 +1,157 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template + struct TriangleMv + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + static Type type; + + public: + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMv() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMv(const Vec3vf& v0, const Vec3vf& v1, const Vec3vf& v2, const vuint& geomIDs, const vuint& primIDs) + : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool valid() const { return geomIDs != vuint(-1); } + + /* Returns true if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } + 
__forceinline const vuint& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } + __forceinline const vuint& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2); + Vec3vf upper = max(v0,v1,v2); + vbool mask = valid(); + lower.x = select(mask,lower.x,vfloat(pos_inf)); + lower.y = select(mask,lower.y,vfloat(pos_inf)); + lower.z = select(mask,lower.z,vfloat(pos_inf)); + upper.x = select(mask,upper.x,vfloat(neg_inf)); + upper.y = select(mask,upper.y,vfloat(neg_inf)); + upper.z = select(mask,upper.z,vfloat(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Non temporal store */ + __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src) + { + vfloat::store_nt(&dst->v0.x,src.v0.x); + vfloat::store_nt(&dst->v0.y,src.v0.y); + vfloat::store_nt(&dst->v0.z,src.v0.z); + vfloat::store_nt(&dst->v1.x,src.v1.x); + vfloat::store_nt(&dst->v1.y,src.v1.y); + vfloat::store_nt(&dst->v1.z,src.v1.z); + vfloat::store_nt(&dst->v2.x,src.v2.x); + vfloat::store_nt(&dst->v2.y,src.v2.y); + vfloat::store_nt(&dst->v2.z,src.v2.z); + vuint::store_nt(&dst->geomIDs,src.geomIDs); + vuint::store_nt(&dst->primIDs,src.primIDs); + } + + /* Fill triangle from triangle list */ + __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene) + { + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; iget(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + vgeomID [i] = geomID; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = 
p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + TriangleMv::store_nt(this,TriangleMv(v0,v1,v2,vgeomID,vprimID)); + } + + /* Updates the primitive */ + __forceinline BBox3fa update(TriangleMesh* mesh) + { + BBox3fa bounds = empty; + vuint vgeomID = -1, vprimID = -1; + Vec3vf v0 = zero, v1 = zero, v2 = zero; + + for (size_t i=0; itriangle(primId); + const Vec3fa p0 = mesh->vertex(tri.v[0]); + const Vec3fa p1 = mesh->vertex(tri.v[1]); + const Vec3fa p2 = mesh->vertex(tri.v[2]); + bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2))); + vgeomID [i] = geomId; + vprimID [i] = primId; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + new (this) TriangleMv(v0,v1,v2,vgeomID,vprimID); + return bounds; + } + + public: + Vec3vf v0; // 1st vertex of the triangles + Vec3vf v1; // 2nd vertex of the triangles + Vec3vf v2; // 3rd vertex of the triangles + private: + vuint geomIDs; // geometry ID + vuint primIDs; // primitive ID + }; + + template + typename TriangleMv::Type TriangleMv::type; + + typedef TriangleMv<4> Triangle4v; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h new file mode 100644 index 00000000000..6af0d5a11c0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h @@ -0,0 +1,206 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "triangle_intersector_pluecker.h" +#include "triangle_intersector_moeller.h" +#include "triangle_intersector_woop.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M triangles with 1 ray */ + template + struct TriangleMvIntersector1Moeller + { + typedef TriangleMv Primitive; + typedef MoellerTrumboreIntersector1 Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. 
*/ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + + template + struct TriangleMvIntersector1Woop + { + typedef TriangleMv Primitive; + typedef WoopIntersector1 intersec; + typedef WoopPrecalculations1 Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + + /*! 
Intersects M triangles with K rays */ + template + struct TriangleMvIntersectorKMoeller + { + typedef TriangleMv Primitive; + typedef MoellerTrumboreIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i v0 = broadcast>(tri.v0,i); + const Vec3vf v1 = broadcast>(tri.v1,i); + const Vec3vf v2 = broadcast>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity(),*/IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) + { + vbool valid0 = valid_i; + + for (size_t i=0; i v0 = broadcast>(tri.v0,i); + const Vec3vf v1 = broadcast>(tri.v1,i); + const Vec3vf v2 = broadcast>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity(),*/OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity(),*/Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx + } + }; + + /*! 
Intersects M triangles with 1 ray */ + template + struct TriangleMvIntersector1Pluecker + { + typedef TriangleMv Primitive; + typedef PlueckerIntersector1 Precalculations; + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! Intersects M triangles with K rays */ + template + struct TriangleMvIntersectorKPluecker + { + typedef TriangleMv Primitive; + typedef PlueckerIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const Primitive& tri) + { + for (size_t i=0; i v0 = broadcast>(tri.v0,i); + const Vec3vf v1 = broadcast>(tri.v1,i); + const Vec3vf v2 = broadcast>(tri.v2,i); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. 
*/ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const Primitive& tri) + { + vbool valid0 = valid_i; + + for (size_t i=0; i v0 = broadcast>(tri.v0,i); + const Vec3vf v1 = broadcast>(tri.v1,i); + const Vec3vf v2 = broadcast>(tri.v2,i); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(normal.trav_prims,1,1,1); + pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const Primitive& tri) + { + STAT3(shadow.trav_prims,1,1,1); + return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h new file mode 100644 index 00000000000..63137aee166 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h @@ -0,0 +1,201 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "primitive.h" + +namespace embree +{ + /* Stores the vertices of M triangles in struct of array layout */ + template + struct TriangleMvMB + { + public: + struct Type : public PrimitiveType + { + const char* name() const; + size_t sizeActive(const char* This) const; + size_t sizeTotal(const char* This) const; + size_t getBytes(const char* This) const; + }; + + static 
Type type; + + public: + + /* primitive supports single time segments */ + static const bool singleTimeSegment = true; + + /* Returns maximum number of stored triangles */ + static __forceinline size_t max_size() { return M; } + + /* Returns required number of primitive blocks for N primitives */ + static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); } + + public: + + /* Default constructor */ + __forceinline TriangleMvMB() {} + + /* Construction from vertices and IDs */ + __forceinline TriangleMvMB(const Vec3vf& a0, const Vec3vf& a1, + const Vec3vf& b0, const Vec3vf& b1, + const Vec3vf& c0, const Vec3vf& c1, + const vuint& geomIDs, const vuint& primIDs) + : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {} + + /* Returns a mask that tells which triangles are valid */ + __forceinline vbool valid() const { return geomIDs != vuint(-1); } + + /* Returns if the specified triangle is valid */ + __forceinline bool valid(const size_t i) const { assert(i& geomID() { return geomIDs; } + __forceinline const vuint& geomID() const { return geomIDs; } + __forceinline unsigned int geomID(const size_t i) const { assert(i& primID() { return primIDs; } + __forceinline const vuint& primID() const { return primIDs; } + __forceinline unsigned int primID(const size_t i) const { assert(i lower = min(v0,v1,v2); + Vec3vf upper = max(v0,v1,v2); + const vbool mask = valid(); + lower.x = select(mask,lower.x,vfloat(pos_inf)); + lower.y = select(mask,lower.y,vfloat(pos_inf)); + lower.z = select(mask,lower.z,vfloat(pos_inf)); + upper.x = select(mask,upper.x,vfloat(neg_inf)); + upper.y = select(mask,upper.y,vfloat(neg_inf)); + upper.z = select(mask,upper.z,vfloat(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the bounds of the triangles at t1 */ + __forceinline BBox3fa bounds1() 
const + { + const Vec3vf p0 = v0+dv0; + const Vec3vf p1 = v1+dv1; + const Vec3vf p2 = v2+dv2; + Vec3vf lower = min(p0,p1,p2); + Vec3vf upper = max(p0,p1,p2); + const vbool mask = valid(); + lower.x = select(mask,lower.x,vfloat(pos_inf)); + lower.y = select(mask,lower.y,vfloat(pos_inf)); + lower.z = select(mask,lower.z,vfloat(pos_inf)); + upper.x = select(mask,upper.x,vfloat(neg_inf)); + upper.y = select(mask,upper.y,vfloat(neg_inf)); + upper.z = select(mask,upper.z,vfloat(neg_inf)); + return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)), + Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z))); + } + + /* Calculate the linear bounds of the primitive */ + __forceinline LBBox3fa linearBounds() const { + return LBBox3fa(bounds0(),bounds1()); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) + { + vuint vgeomID = -1, vprimID = -1; + Vec3vf va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf va1 = zero, vb1 = zero, vc1 = zero; + + BBox3fa bounds0 = empty; + BBox3fa bounds1 = empty; + + for (size_t i=0; iget(geomID); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z; + va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z; + vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z; + vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z; + vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z; + vc1.x[i] = 
c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return LBBox3fa(bounds0,bounds1); + } + + /* Fill triangle from triangle list */ + __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) + { + vuint vgeomID = -1, vprimID = -1; + Vec3vf va0 = zero, vb0 = zero, vc0 = zero; + Vec3vf va1 = zero, vb1 = zero, vc1 = zero; + + LBBox3fa allBounds = empty; + for (size_t i=0; iget(geomID); + const range itime_range = mesh->timeSegmentRange(time_range); + assert(itime_range.size() == 1); + const int ilower = itime_range.begin(); + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + allBounds.extend(mesh->linearBounds(primID, time_range)); + const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0); + const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1); + const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0); + const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1); + const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0); + const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1); + const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1)); + auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v); + auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v); + auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v); + vgeomID [i] = geomID; + vprimID [i] = primID; + va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z; + va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z; + vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z; + vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z; + vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z; + vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z; + } + new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID); + return allBounds; + } + + public: + Vec3vf v0; 
// 1st vertex of the triangles + Vec3vf v1; // 2nd vertex of the triangles + Vec3vf v2; // 3rd vertex of the triangles + Vec3vf dv0; // difference vector between time steps t0 and t1 for first vertex + Vec3vf dv1; // difference vector between time steps t0 and t1 for second vertex + Vec3vf dv2; // difference vector between time steps t0 and t1 for third vertex + private: + vuint geomIDs; // geometry ID + vuint primIDs; // primitive ID + }; + + template + typename TriangleMvMB::Type TriangleMvMB::type; + + typedef TriangleMvMB<4> Triangle4vMB; +} diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h new file mode 100644 index 00000000000..35a260d826e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h @@ -0,0 +1,211 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "triangle.h" +#include "intersector_epilog.h" + +namespace embree +{ + namespace isa + { + /*! Intersects M motion blur triangles with 1 ray */ + template + struct TriangleMvMBIntersector1Moeller + { + typedef TriangleMvMB Primitive; + typedef MoellerTrumboreIntersector1 Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + pre.intersect(ray,v0,v1,v2,Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. 
*/ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template + struct TriangleMvMBIntersectorKMoeller + { + typedef TriangleMvMB Primitive; + typedef MoellerTrumboreIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. */ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMvMB& tri) + { + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); + const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); + const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. 
*/ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMvMB& tri) + { + vbool valid0 = valid_i; + + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); + const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); + const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. */ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf time(ray.time()[k]); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf time(ray.time()[k]); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + + /*! 
Intersects M motion blur triangles with 1 ray */ + template + struct TriangleMvMBIntersector1Pluecker + { + typedef TriangleMvMB Primitive; + typedef PlueckerIntersector1 Precalculations; + + /*! Intersect a ray with the M triangles and updates the hit. */ + static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + pre.intersect(ray,v0,v1,v2,UVIdentity(),Intersect1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of M triangles. */ + static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + return pre.intersect(ray,v0,v1,v2,UVIdentity(),Occluded1EpilogM(ray,context,tri.geomID(),tri.primID())); + } + + static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) + { + return PrimitivePointQuery1::pointQuery(query, context, tri); + } + }; + + /*! Intersects M motion blur triangles with K rays. */ + template + struct TriangleMvMBIntersectorKPluecker + { + typedef TriangleMvMB Primitive; + typedef PlueckerIntersectorK Precalculations; + + /*! Intersects K rays with M triangles. 
*/ + static __forceinline void intersect(const vbool& valid_i, Precalculations& pre, RayHitK& ray, IntersectContext* context, const TriangleMvMB& tri) + { + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(normal.trav_prims,1,popcnt(valid_i),K); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); + const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); + const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); + pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity(),IntersectKEpilogM(ray,context,tri.geomID(),tri.primID(),i)); + } + } + + /*! Test for K rays if they are occluded by any of the M triangles. */ + static __forceinline vbool occluded(const vbool& valid_i, Precalculations& pre, RayK& ray, IntersectContext* context, const TriangleMvMB& tri) + { + vbool valid0 = valid_i; + + for (size_t i=0; i::max_size(); i++) + { + if (!tri.valid(i)) break; + STAT3(shadow.trav_prims,1,popcnt(valid0),K); + const Vec3vf time(ray.time()); + const Vec3vf v0 = madd(time,broadcast>(tri.dv0,i),broadcast>(tri.v0,i)); + const Vec3vf v1 = madd(time,broadcast>(tri.dv1,i),broadcast>(tri.v1,i)); + const Vec3vf v2 = madd(time,broadcast>(tri.dv2,i),broadcast>(tri.v2,i)); + pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity(),OccludedKEpilogM(valid0,ray,context,tri.geomID(),tri.primID(),i)); + if (none(valid0)) break; + } + return !valid0; + } + + /*! Intersect a ray with M triangles and updates the hit. 
*/ + static __forceinline void intersect(Precalculations& pre, RayHitK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(normal.trav_prims,1,1,1); + const Vec3vf time(ray.time()[k]); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Intersect1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + + /*! Test if the ray is occluded by one of the M triangles. */ + static __forceinline bool occluded(Precalculations& pre, RayK& ray, size_t k, IntersectContext* context, const TriangleMvMB& tri) + { + STAT3(shadow.trav_prims,1,1,1); + const Vec3vf time(ray.time()[k]); + const Vec3vf v0 = madd(time,Vec3vf(tri.dv0),Vec3vf(tri.v0)); + const Vec3vf v1 = madd(time,Vec3vf(tri.dv1),Vec3vf(tri.v1)); + const Vec3vf v2 = madd(time,Vec3vf(tri.dv2),Vec3vf(tri.v2)); + return pre.intersect(ray,k,v0,v1,v2,UVIdentity(),Occluded1KEpilogM(ray,k,context,tri.geomID(),tri.primID())); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/hash.h b/thirdparty/embree-aarch64/kernels/hash.h new file mode 100644 index 00000000000..4abbe203d6b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/hash.h @@ -0,0 +1,5 @@ + +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6" diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h new file mode 100644 index 00000000000..c0e78820f8e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h @@ -0,0 +1,669 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene_curves.h" + +namespace embree +{ + class BezierBasis + { + public: + + template + static 
__forceinline Vec4 eval(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0 * t0 * t0; + const T B1 = 3.0f * t1 * (t0 * t0); + const T B2 = 3.0f * (t1 * t1) * t0; + const T B3 = t1 * t1 * t1; + return Vec4(B0,B1,B2,B3); + } + + template + static __forceinline Vec4 derivative(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = -(t0*t0); + const T B1 = madd(-2.0f,t0*t1,t0*t0); + const T B2 = msub(+2.0f,t0*t1,t1*t1); + const T B3 = +(t1*t1); + return T(3.0f)*Vec4(B0,B1,B2,B3); + } + + template + static __forceinline Vec4 derivative2(const T& u) + { + const T t1 = u; + const T t0 = 1.0f-t1; + const T B0 = t0; + const T B1 = madd(-2.0f,t0,t1); + const T B2 = madd(-2.0f,t1,t0); + const T B3 = t1; + return T(6.0f)*Vec4(B0,B1,B2,B3); + } + }; + + struct PrecomputedBezierBasis + { + enum { N = 16 }; + public: + PrecomputedBezierBasis() {} + PrecomputedBezierBasis(int shift); + + /* basis for bezier evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bezier derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBezierBasis bezier_basis0; + extern PrecomputedBezierBasis bezier_basis1; + + + template + struct LinearBezierCurve + { + V v0,v1; + + __forceinline LinearBezierCurve () {} + + __forceinline LinearBezierCurve (const LinearBezierCurve& other) + : v0(other.v0), v1(other.v1) {} + + __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) { + v0 = other.v0; v1 = other.v1; return *this; + } + + __forceinline LinearBezierCurve (const V& v0, const V& v1) + : v0(v0), v1(v1) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v1; } + + bool hasRoot() const; + + friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) { + return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << 
")"; + } + }; + + template<> __forceinline bool LinearBezierCurve::hasRoot() const { + return numRoots(v0,v1); + } + + template + struct QuadraticBezierCurve + { + V v0,v1,v2; + + __forceinline QuadraticBezierCurve () {} + + __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other) + : v0(other.v0), v1(other.v1), v2(other.v2) {} + + __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this; + } + + __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2) + : v0(v0), v1(v1), v2(v2) {} + + __forceinline V begin() const { return v0; } + __forceinline V end () const { return v2; } + + __forceinline V interval() const { + return merge(v0,v1,v2); + } + + __forceinline BBox bounds() const { + return merge(BBox(v0),BBox(v1),BBox(v2)); + } + + friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) { + return cout << "QuadraticBezierCurve ( (" << a.u.lower << ", " << a.u.upper << "), " << a.v0 << ", " << a.v1 << ", " << a.v2 << ")"; + } + }; + + + typedef QuadraticBezierCurve QuadraticBezierCurve1f; + typedef QuadraticBezierCurve QuadraticBezierCurve2fa; + typedef QuadraticBezierCurve QuadraticBezierCurve3fa; + + template + struct CubicBezierCurve + { + Vertex v0,v1,v2,v3; + + __forceinline CubicBezierCurve() {} + + template + __forceinline CubicBezierCurve (const CubicBezierCurve& other) + : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {} + + __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) { + v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this; + } + + __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return v0; + } + + __forceinline Vertex end() const { + return v3; + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); 
+ } + + __forceinline Vertex begin_direction() const { + return v1-v0; + } + + __forceinline Vertex end_direction() const { + return v3-v2; + } + + __forceinline CubicBezierCurve xfm(const Vertex& dx) const { + return CubicBezierCurve(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve vxfm(const Vertex& dx) const { + return CubicBezierCurve(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx)); + } + + __forceinline CubicBezierCurve xfm(const Vertex& dx, const Vertex& p) const { + return CubicBezierCurve(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx)); + } + + __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space) const + { + const Vec3fa q0 = xfmVector(space,v0); + const Vec3fa q1 = xfmVector(space,v1); + const Vec3fa q2 = xfmVector(space,v2); + const Vec3fa q3 = xfmVector(space,v3); + return CubicBezierCurve(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3fa q0 = xfmVector(space,v0-p); + const Vec3fa q1 = xfmVector(space,v1-p); + const Vec3fa q2 = xfmVector(space,v2-p); + const Vec3fa q3 = xfmVector(space,v3-p); + return CubicBezierCurve(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return CubicBezierCurve(q0,q1,q2,q3); + } + + __forceinline CubicBezierCurve xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const + { + const Vec3fa q0 = xfmVector(space,s*(v0-p)); + const Vec3fa q1 = xfmVector(space,s*(v1-p)); + const Vec3fa q2 = xfmVector(space,s*(v2-p)); + const Vec3fa q3 = xfmVector(space,s*(v3-p)); + return CubicBezierCurve(q0,q1,q2,q3); + } + + __forceinline int maxRoots() const; + + __forceinline BBox bounds() const { + return 
merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); + } + + __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3); + } + + __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) { + return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3); + } + + __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b, const CubicBezierCurve& c) { + return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3)); + } + + __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) { + return cmadd((Vertex(1.0f)-t),a,t*b); + } + + __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) { + return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3)); + } + + __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + new (&left ) CubicBezierCurve(p00,p10,p20,p30); + new (&right) CubicBezierCurve(p30,p21,p12,p03); + } + + __forceinline CubicBezierCurve split() const + { + const float u0 = 0.0f, u1 = 1.0f; + const float dscale = 
(u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve(P0,P1,P2,P3); + } + + __forceinline CubicBezierCurve split(const BBox1f& u) const + { + const float u0 = u.lower, u1 = u.upper; + const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1))); + const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1))); + Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale); + const Vec2vfx P3 = shift_right_1(P0); + const Vec2vfx dP3du = shift_right_1(dP0du); + const Vec2vfx P1 = P0 + dP0du; + const Vec2vfx P2 = P3 - dP3du; + return CubicBezierCurve(P0,P1,P2,P3); + } + + __forceinline void eval(float t, Vertex& p, Vertex& dp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + p = p30; + dp = Vertex(3.0f)*(p21-p20); + } + +#if 0 + __forceinline Vertex eval(float t) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + + return p30; + } +#else + __forceinline Vertex eval(const float t) const + { + const Vec4 b = BezierBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } +#endif + + __forceinline Vertex eval_dt(float t) const + { + const Vertex p00 = v1-v0; + const Vertex p01 = 
v2-v1; + const Vertex p02 = v3-v2; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p20 = lerp(p10,p11,t); + return Vertex(3.0f)*p20; + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4 b = BezierBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4 b = BezierBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const + { + const Vec2vfx p00 = v0; + const Vec2vfx p01 = v1; + const Vec2vfx p02 = v2; + const Vec2vfx p03 = v3; + + const Vec2vfx p10 = lerp(p00,p01,t); + const Vec2vfx p11 = lerp(p01,p02,t); + const Vec2vfx p12 = lerp(p02,p03,t); + + const Vec2vfx p20 = lerp(p10,p11,t); + const Vec2vfx p21 = lerp(p11,p12,t); + + const Vec2vfx p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloatx(3.0f)*(p21-p20); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + const Vertex p00 = v0; + const Vertex p01 = v1; + const Vertex p02 = v2; + const Vertex p03 = v3; + const Vertex p10 = lerp(p00,p01,t); + const Vertex p11 = lerp(p01,p02,t); + const Vertex p12 = lerp(p02,p03,t); + const Vertex p20 = lerp(p10,p11,t); + const Vertex p21 = lerp(p11,p12,t); + const Vertex p30 = lerp(p20,p21,t); + p = p30; + dp = 3.0f*(p21-p20); + ddp = eval_dudu(t); + } + + __forceinline CubicBezierCurve clip(const Interval1f& u1) const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f1,df1; eval(u1.upper,f1,df1); + float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } + + __forceinline QuadraticBezierCurve derivative() const + { + const Vertex q0 = 3.0f*(v1-v0); + const Vertex q1 = 3.0f*(v2-v1); + const Vertex q2 = 3.0f*(v3-v2); + return QuadraticBezierCurve(q0,q1,q2); + } + + __forceinline BBox derivative_bounds(const Interval1f& u1) 
const + { + Vertex f0,df0; eval(u1.lower,f0,df0); + Vertex f3,df3; eval(u1.upper,f3,df3); + const float s = u1.upper-u1.lower; + const Vertex f1 = f0+s*(1.0f/3.0f)*df0; + const Vertex f2 = f3-s*(1.0f/3.0f)*df3; + const Vertex q0 = s*df0; + const Vertex q1 = 3.0f*(f2-f1); + const Vertex q2 = s*df3; + return merge(BBox(q0),BBox(q1),BBox(q2)); + } + + template + __forceinline Vec4vf veval(const vfloat& t) const + { + const Vec4vf b = BezierBasis::eval(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_du(const vfloat& t) const + { + const Vec4vf b = BezierBasis::derivative(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_dudu(const vfloat& t) const + { + const Vec4vf b = BezierBasis::derivative2(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const + { + const Vec4vf p00 = v0; + const Vec4vf p01 = v1; + const Vec4vf p02 = v2; + const Vec4vf p03 = v3; + + const Vec4vf p10 = lerp(p00,p01,t); + const Vec4vf p11 = lerp(p01,p02,t); + const Vec4vf p12 = lerp(p02,p03,t); + const Vec4vf p20 = lerp(p10,p11,t); + const Vec4vf p21 = lerp(p11,p12,t); + const Vec4vf p30 = lerp(p20,p21,t); + + p = p30; + dp = vfloat(3.0f)*(p21-p20); + } + + template> + __forceinline Vec eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bezier_basis0.c0[size][ofs]), Vec(v0), + madd(vfloat::loadu(&bezier_basis0.c1[size][ofs]), Vec(v1), + madd(vfloat::loadu(&bezier_basis0.c2[size][ofs]), Vec(v2), + vfloat::loadu(&bezier_basis0.c3[size][ofs]) * Vec(v3)))); + } + + template> + __forceinline Vec eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + 
assert(ofs <= size); + return madd(vfloat::loadu(&bezier_basis1.c0[size][ofs]), Vec(v0), + madd(vfloat::loadu(&bezier_basis1.c1[size][ofs]), Vec(v1), + madd(vfloat::loadu(&bezier_basis1.c2[size][ofs]), Vec(v2), + vfloat::loadu(&bezier_basis1.c3[size][ofs]) * Vec(v3)))); + } + + template> + __forceinline Vec derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bezier_basis0.d0[size][ofs]), Vec(v0), + madd(vfloat::loadu(&bezier_basis0.d1[size][ofs]), Vec(v1), + madd(vfloat::loadu(&bezier_basis0.d2[size][ofs]), Vec(v2), + vfloat::loadu(&bezier_basis0.d3[size][ofs]) * Vec(v3)))); + } + + template> + __forceinline Vec derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBezierBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bezier_basis1.d0[size][ofs]), Vec(v0), + madd(vfloat::loadu(&bezier_basis1.d1[size][ofs]), Vec(v1), + madd(vfloat::loadu(&bezier_basis1.d2[size][ofs]), Vec(v2), + vfloat::loadu(&bezier_basis1.d3[size][ofs]) * Vec(v3)))); + } + + /* calculates bounds of bezier curve geometry */ + __forceinline BBox3fa accurateBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec3vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec3vfx p = eval0>(i,N); + const Vec3vfx dp = derivative0>(i,N); + const Vec3vfx pm = p-Vec3vfx(scale)*select(vi!=vintx(0),dp,Vec3vfx(zero)); + const Vec3vfx pp = p+Vec3vfx(scale)*select(vi!=vintx(N),dp,Vec3vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + return BBox3fa(lower,upper); + } + + /* calculates bounds of bezier curve geometry */ + 
__forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0(i,N); + const Vec4vfx dp = derivative0(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa 
upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w)))); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve) { + return cout << "CubicBezierCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + +#if defined(__AVX__) + template<> + __forceinline CubicBezierCurve CubicBezierCurve::clip(const Interval1f& u1) const + { + const vfloat8 p00 = vfloat8(v0); + const vfloat8 p01 = vfloat8(v1); + const vfloat8 p02 = vfloat8(v2); + const vfloat8 p03 = vfloat8(v3); + + const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper)); + const vfloat8 p10 = lerp(p00,p01,t); + const vfloat8 p11 = lerp(p01,p02,t); + const vfloat8 p12 = lerp(p02,p03,t); + const vfloat8 p20 = lerp(p10,p11,t); + const vfloat8 p21 = lerp(p11,p12,t); + const vfloat8 p30 = lerp(p20,p21,t); + + const vfloat8 f01 = p30; + const vfloat8 df01 = vfloat8(3.0f)*(p21-p20); + + const vfloat4 f0 = extract4<0>(f01), f1 = extract4<1>(f01); + const vfloat4 df0 = extract4<0>(df01), df1 = extract4<1>(df01); + const float s = u1.upper-u1.lower; + return CubicBezierCurve(f0,f0+s*(1.0f/3.0f)*df0,f1-s*(1.0f/3.0f)*df1,f1); + } +#endif + + template using BezierCurveT = CubicBezierCurve; + + typedef CubicBezierCurve CubicBezierCurve1f; + typedef CubicBezierCurve CubicBezierCurve2fa; + typedef CubicBezierCurve CubicBezierCurve3fa; + typedef CubicBezierCurve BezierCurve3fa; + + template<> __forceinline int CubicBezierCurve::maxRoots() const + { + float eps = 1E-4f; + bool neg0 = v0 <= 0.0f; bool zero0 = fabs(v0) < eps; + bool neg1 = v1 <= 0.0f; bool zero1 = fabs(v1) < eps; + bool neg2 = v2 <= 0.0f; bool zero2 = fabs(v2) < eps; + bool neg3 = v3 <= 0.0f; bool zero3 = fabs(v3) < eps; + return (neg0 != neg1 || zero0) + (neg1 != neg2 || zero1) + (neg2 != neg3 || zero2 || zero3); + } + + 
template<> __forceinline int CubicBezierCurve::maxRoots() const { + return numRoots(v0,v1) + numRoots(v1,v2) + numRoots(v2,v3); + } + + __forceinline CubicBezierCurve enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve& curve) + { + return CubicBezierCurve(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h new file mode 100644 index 00000000000..d87ed41ccb1 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h @@ -0,0 +1,372 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template + static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + const T v0_3 = lerp(v0_2,v1_2,uu); + return v0_3; + } + + template + static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3) + { + const T v0_1 = lerp(v0,v1,uu); + const T v1_1 = lerp(v1,v2,uu); + const T v2_1 = lerp(v2,v3,uu); + const T v0_2 = lerp(v0_1,v1_1,uu); + const T v1_2 = lerp(v1_1,v2_1,uu); + return S(3.0f)*(v1_2-v0_2); + } + + template + __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 36.0f * (16.0f * v[y][x] + 4.0f * (v[y-1][x] + v[y+1][x] + v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1])); + } + + 
template + __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + (v[y-1][x-1] + v[y-1][x+1])); + } + + template + __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * (v[y][x-1] + v[y][x+1]) + v[y+1][x-1] + v[y+1][x+1]); + } + + template + __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x-1] + v[y+1][x-1]); + } + + template + __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x) { + return 1.0f / 18.0f * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * (v[y-1][x] + v[y+1][x]) + v[y-1][x+1] + v[y+1][x+1]); + } + + template + __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x) + { + return 1.0f / 9.0f * (4.0f * v[y][x] + 2.0f * (v[y+delta_y][x] + v[y][x+delta_x]) + v[y+delta_y][x+delta_x]); + } + + template + class __aligned(64) BezierPatchT + { + public: + Vertex matrix[4][4]; + + public: + + __forceinline BezierPatchT() {} + + __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride); + + __forceinline BezierPatchT(const CatmullClarkPatchT& patch); + + __forceinline BezierPatchT(const CatmullClarkPatchT& patch, + const BezierCurveT* border0, + const BezierCurveT* border1, + const BezierCurveT* border2, + const BezierCurveT* border3); + + __forceinline BezierPatchT(const BSplinePatchT& source) + { + /* compute inner bezier control points */ + matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1); + matrix[0][3] = 
computeInnerBezierControlPoint(source.v,1,2); + matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2); + matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1); + + /* compute top edge control points */ + matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1); + matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); + + /* compute buttom edge control points */ + matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); + matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); + + /* compute left edge control points */ + matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1); + matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1); + + /* compute right edge control points */ + matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2); + matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2); + + /* compute corner control points */ + matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1); + matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1); + matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1); + matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1); + } + + static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv) + { + const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); + const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3]))); + const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3]))); + const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3]))); + return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3))); + } + + static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = 
BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative2(uu); + const Vec4f Bv = BezierBasis::eval(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::eval(uu); + const Vec4f Bv = BezierBasis::derivative2(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vec4f Bu = BezierBasis::derivative(uu); + const Vec4f Bv = BezierBasis::derivative(vv); + return bilinear(Bu,matrix,Bv); + } + + static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t normal(const float uu, const float vv) + { + const Vertex_t dPdu = eval_du(matrix,uu,vv); + const Vertex_t dPdv = eval_dv(matrix,uu,vv); + return cross(dPdu,dPdv); + } + + __forceinline Vertex_t eval(const float uu, const float vv) const { + return eval(matrix,uu,vv); + } + + __forceinline Vertex_t eval_du(const float uu, const float vv) const { + return eval_du(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dv(const float uu, const float vv) const { + return eval_dv(matrix,uu,vv); + } + + 
__forceinline Vertex_t eval_dudu(const float uu, const float vv) const { + return eval_dudu(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { + return eval_dvdv(matrix,uu,vv); + } + + __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { + return eval_dudv(matrix,uu,vv); + } + + __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n) const + { + const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]); + const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]); + const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]); + const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]); + return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x; + } + + template + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) 
const + { + if (P) { + const Vec4 u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::derivative(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::derivative(vv); + for (size_t i=0; i u_n = BezierBasis::derivative2(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i u_n = BezierBasis::derivative(uu); + const Vec4 v_n = BezierBasis::derivative(vv); + for (size_t i=0; i + static __forceinline Vec3 eval(const Vertex matrix[4][4], const T& uu, const T& vv) + { + const T one_minus_uu = 1.0f - uu; + const T one_minus_vv = 1.0f - vv; + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = + madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); + + const T y = + madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))), 
+ B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); + + const T z = + madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); + + return Vec3(x,y,z); + } + + template + __forceinline Vec3 eval(const vfloat& uu, const vfloat& vv) const { + return eval(matrix,uu,vv); + } + + template + static __forceinline Vec3 normal(const Vertex matrix[4][4], const T& uu, const T& vv) + { + + const Vec3 matrix_00 = Vec3(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3 matrix_01 = Vec3(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3 matrix_02 = Vec3(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3 matrix_03 = Vec3(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3 matrix_10 = Vec3(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3 matrix_11 = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3 matrix_12 = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3 matrix_13 = Vec3(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + + const Vec3 matrix_20 = Vec3(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3 matrix_21 = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + const Vec3 matrix_22 = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3 matrix_23 = Vec3(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + const Vec3 matrix_30 = Vec3(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + const Vec3 matrix_31 = Vec3(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3 matrix_32 = Vec3(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3 
matrix_33 = Vec3(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + /* tangentU */ + const Vec3 col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3 col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3 col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3 col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3 tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3 row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3 row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3 row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3 row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3 tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3 n = cross(tangentU,tangentV); + return n; + } + + template + __forceinline Vec3 normal(const vfloat& uu, const vfloat& vv) const { + return normal(matrix,uu,vv); + } + }; + + typedef BezierPatchT BezierPatch3fa; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h new file mode 100644 index 00000000000..35748754bd0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h @@ -0,0 +1,191 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_curve.h" + +namespace embree +{ + template + class __aligned(64) BilinearPatchT + { + typedef CatmullClark1RingT CatmullClarkRing; + typedef CatmullClarkPatchT CatmullClarkPatch; + + public: + Vertex v[4]; + + public: + + __forceinline BilinearPatchT () {} + + __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView& vertices) { + init(edge,vertices.getPtr(),vertices.getStride()); + } + 
+ __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + init(edge,vertices,stride); + } + + __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride) + { + v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); + } + + __forceinline BilinearPatchT (const CatmullClarkPatch& patch) + { + v[0] = patch.ring[0].getLimitVertex(); + v[1] = patch.ring[1].getLimitVertex(); + v[2] = patch.ring[2].getLimitVertex(); + v[3] = patch.ring[3].getLimitVertex(); + } + + __forceinline BBox bounds() const + { + + BBox bounds (v[0]); + bounds.extend(v[1]); + bounds.extend(v[2]); + bounds.extend(v[3]); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const { + return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const { + return lerp(v[1]-v[0],v[2]-v[3],vv); + } + + __forceinline Vertex eval_dv(const float uu, const float vv) const { + return lerp(v[3]-v[0],v[2]-v[1],uu); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const { + return Vertex(zero); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const { + return (v[2]-v[3]) - (v[1]-v[0]); + } + + __forceinline Vertex normal(const float uu, const float vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); 
+ } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template + __forceinline Vec3 eval(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv); + const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv); + const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv); + return Vec3(x,y,z); + } + + template + __forceinline Vec3 eval_du(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv); + const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv); + const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv); + return Vec3(x,y,z); + } + + template + __forceinline Vec3 eval_dv(const vfloat& uu, const vfloat& vv) const + { + const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu); + const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu); + const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu); + return Vec3(x,y,z); + } + + template + __forceinline Vec3 normal(const vfloat& uu, const vfloat& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + template + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv); + } + + template + __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv); + } + + template + __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu); + } + + template + __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template + 
__forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return vfloat(zero); + } + + template + __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const { + return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]); + } + + template + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + for (size_t i=0; i BilinearPatch3fa; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h new file mode 100644 index 00000000000..a3256673282 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h @@ -0,0 +1,319 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + class BSplineBasis + { + public: + + template + static __forceinline Vec4 eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = s*s*s; + const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t)); + const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s)); + const T n3 = t*t*t; + return T(1.0f/6.0f)*Vec4(n0,n1,n2,n3); + } + + template + static __forceinline Vec4 derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = -s*s; + const T n1 = -t*t - 4.0f*(t*s); + const T n2 = s*s + 4.0f*(s*t); + const T n3 = t*t; + return T(0.5f)*Vec4(n0,n1,n2,n3); + } + + template + static __forceinline Vec4 derivative2(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = s; + const T n1 = t - 2.0f*s; + const T n2 = s - 2.0f*t; + const T n3 = t; + return Vec4(n0,n1,n2,n3); + } + }; + + struct PrecomputedBSplineBasis + { + enum { N = 16 }; + public: + 
PrecomputedBSplineBasis() {} + PrecomputedBSplineBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedBSplineBasis bspline_basis0; + extern PrecomputedBSplineBasis bspline_basis1; + + template + struct BSplineCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline BSplineCurveT() {} + + __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox bounds() const { + return merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); + } + + __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) { + return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline BSplineCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w); + const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w); + const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w); + const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w); + return BSplineCurveT(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4 b = BSplineBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4 b = BSplineBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4 b = 
BSplineBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template + __forceinline Vec4vf veval(const vfloat& t) const + { + const Vec4vf b = BSplineBasis::eval(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_du(const vfloat& t) const + { + const Vec4vf b = BSplineBasis::derivative(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_dudu(const vfloat& t) const + { + const Vec4vf b = BSplineBasis::derivative2(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const + { + p = veval(t); + dp = veval_du(t); + } + + template + __forceinline Vec4vf eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf derivative0(const int ofs, const int size) const + { + assert(size <= 
PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedBSplineBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf(v3)))); + } + + /* calculates bounds of bspline curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0(i,N); + const Vec4vfx dp = derivative0(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa 
lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) { + return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + template + __forceinline void convert(const BezierCurveT& icurve, BezierCurveT& ocurve) { + ocurve = icurve; + } + + template + __forceinline void convert(const BSplineCurveT& icurve, BSplineCurveT& ocurve) { + ocurve = icurve; + } + + template + __forceinline void convert(const BezierCurveT& icurve, BSplineCurveT& ocurve) + { + const Vertex v0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2)); + const Vertex v1 = msub(2.0f,icurve.v1,icurve.v2); + const Vertex v2 = msub(2.0f,icurve.v2,icurve.v1); + const Vertex v3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3)); + ocurve = BSplineCurveT(v0,v1,v2,v3); + } + + 
template + __forceinline void convert(const BSplineCurveT& icurve, BezierCurveT& ocurve) + { + const Vertex v0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2)); + const Vertex v1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2); + const Vertex v2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2); + const Vertex v3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3)); + ocurve = BezierCurveT(v0,v1,v2,v3); + } + + __forceinline BSplineCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT& curve) + { + return BSplineCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef BSplineCurveT BSplineCurve3fa; +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h new file mode 100644 index 00000000000..9769bc17bdf --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h @@ -0,0 +1,449 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bspline_curve.h" + +namespace embree +{ + template + class __aligned(64) BSplinePatchT + { + typedef CatmullClark1RingT CatmullClarkRing; + typedef CatmullClarkPatchT CatmullClarkPatch; + + public: + + __forceinline BSplinePatchT () {} + + __forceinline BSplinePatchT (const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline BSplinePatchT(const CatmullClarkPatch& patch, + const BezierCurveT* border0, + const BezierCurveT* border1, + const BezierCurveT* border2, + const BezierCurveT* border3) + { + init(patch); + } + + __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) { + 
init(edge,vertices,stride); + } + + __forceinline Vertex hard_corner(const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return 4.0f*v11 - 2.0f*(v12+v21) + v22; + } + + __forceinline Vertex soft_convex_corner( const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + return -8.0f*v11 + 4.0f*(v12+v21) + v22; + } + + __forceinline Vertex convex_corner(const float vertex_crease_weight, + const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + if (std::isinf(vertex_crease_weight)) return hard_corner(v01,v02,v10,v11,v12,v20,v21,v22); + else return soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22); + } + + __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) { + return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride); + } + + __forceinline void init_border(const CatmullClarkRing& edge0, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0.has_opposite_back(0))) + { + v01 = edge0.back(2); + v02 = edge0.back(1); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + __forceinline void init_corner(const CatmullClarkRing& edge0, + Vertex& v00, const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1); + const bool has_back0 = edge0.has_opposite_back(0); + const bool has_front1 = edge0.has_opposite_front(1); + const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2); + + if (likely(has_back0)) { + if (likely(has_front1)) { assert(has_back1 && has_front2); 
v00 = edge0.back(3); } + else { assert(!has_back1); v00 = 2.0f*v01-v02; } + } + else { + if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; } + else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + + void init(const CatmullClarkPatch& patch) + { + /* fill inner vertices */ + const Vertex v11 = v[1][1] = patch.ring[0].vtx; + const Vertex v12 = v[1][2] = patch.ring[1].vtx; + const Vertex v22 = v[2][2] = patch.ring[2].vtx; + const Vertex v21 = v[2][1] = patch.ring[3].vtx; + + /* fill border vertices */ + init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22); + init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21); + init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11); + init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + void init_border(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v01, Vertex& v02, + const Vertex& v11, const Vertex& v12, + const Vertex& v21, const Vertex& v22) + { + if (likely(edge0->hasOpposite())) + { + const HalfEdge* e = edge0->opposite()->next()->next(); + v01 = load(e,vertices,stride); + v02 = load(e->next(),vertices,stride); + } else { + v01 = 2.0f*v11-v21; + v02 = 2.0f*v12-v22; + } + } + + void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride, + Vertex& v00, const Vertex& v01, const Vertex& v02, + const Vertex& v10, const Vertex& v11, const Vertex& v12, + const Vertex& v20, const Vertex& v21, const Vertex& v22) + { + const bool has_back0 = edge0->hasOpposite(); + const bool has_front1 = edge0->prev()->hasOpposite(); + + if 
(likely(has_back0)) + { + const HalfEdge* e = edge0->opposite()->next(); + if (likely(has_front1)) + { + assert(e->hasOpposite()); + assert(edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = load(e->opposite()->prev(),vertices,stride); + } + else { + assert(!e->hasOpposite()); + v00 = 2.0f*v01-v02; + } + } + else + { + if (likely(has_front1)) { + assert(!edge0->prev()->opposite()->prev()->hasOpposite()); + v00 = 2.0f*v10-v20; + } + else { + assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight)); + v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22); + } + } + } + + void init(const HalfEdge* edge0, const char* vertices, size_t stride) + { + assert( edge0->isRegularFace() ); + + /* fill inner vertices */ + const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next(); + const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next(); + const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next(); + const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0 == edge3->next()); + + /* fill border vertices */ + init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22); + init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21); + init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11); + init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12); + + /* fill corner vertices */ + init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22); + init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21); + init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11); + init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12); + } + + __forceinline BBox bounds() const + { + const Vertex* const cv = &v[0][0]; + BBox bounds (cv[0]); + for 
(size_t i=1; i<16 ; i++) + bounds.extend( cv[i] ); + return bounds; + } + + __forceinline Vertex eval(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_du(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + 
const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudu(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::eval(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative2(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dvdv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative2(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::eval(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex eval_dudv(const float uu, const float vv) const + { + const Vec4f v_n = BSplineBasis::derivative(vv); + const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0]))); + const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1]))); + const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2]))); + const Vertex_t curve3 = 
madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3]))); + + const Vec4f u_n = BSplineBasis::derivative(uu); + return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3))); + } + + __forceinline Vertex normal(const float uu, const float vv) const + { + const Vertex tu = eval_du(uu,vv); + const Vertex tv = eval_dv(uu,vv); + return cross(tu,tv); + } + + template + __forceinline Vec3 eval(const T& uu, const T& vv, const Vec4& u_n, const Vec4& v_n) const + { + const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x)))); + const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x)))); + const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x)))); + const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x)))); + const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + + const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y)))); + const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y)))); + const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y)))); + const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y)))); + const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y))); + + const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z)))); + const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z)))); + const T curve2_z = 
madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z)))); + const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z)))); + const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z))); + + return Vec3(x,y,z); + } + + template + __forceinline Vec3 eval(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 eval_du(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::derivative(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 eval_dv(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 eval_dudu(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::derivative2(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 eval_dvdv(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::derivative2(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 eval_dudv(const T& uu, const T& vv) const + { + const Vec4 u_n = BSplineBasis::derivative(uu); + const Vec4 v_n = BSplineBasis::derivative(vv); + return eval(uu,vv,u_n,v_n); + } + + template + __forceinline Vec3 normal(const T& uu, const T& vv) const { + return cross(eval_du(uu,vv),eval_dv(uu,vv)); + } + + void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = 
eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template + __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n) const + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template + void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const + { + if (P) { + const Vec4 u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + for (size_t i=0; i u_n = BSplineBasis::derivative(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + for (size_t i=0; i u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i u_n = BSplineBasis::derivative2(uu); + const Vec4 v_n = BSplineBasis::eval(vv); + for (size_t i=0; i u_n = BSplineBasis::eval(uu); + const Vec4 v_n = BSplineBasis::derivative2(vv); + for (size_t i=0; i u_n = BSplineBasis::derivative(uu); + const Vec4 v_n = BSplineBasis::derivative(vv); + for (size_t i=0; i BSplinePatch3fa; +} diff --git 
a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h new file mode 100644 index 00000000000..05031cf6b9a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h @@ -0,0 +1,85 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" + +namespace embree +{ + static const size_t MAX_PATCH_VALENCE = 16; //!< maximum number of vertices of a patch + static const size_t MAX_RING_FACE_VALENCE = 64; //!< maximum number of faces per ring + static const size_t MAX_RING_EDGE_VALENCE = 2*64; //!< maximum number of edges per ring + + class CatmullClarkPrecomputedCoefficients + { + private: + + float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1]; + + float* table_limittangent_a[MAX_RING_FACE_VALENCE+1]; + float* table_limittangent_b[MAX_RING_FACE_VALENCE+1]; + float table_limittangent_c[MAX_RING_FACE_VALENCE+1]; + + __forceinline float set_cos_2PI_div_n(const size_t n) { + if (unlikely(n == 0)) return 1.0f; + return cosf(2.0f*float(pi)/(float)n); + } + + __forceinline float set_limittangent_a(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0); + return cosf(2.0f*float(pi)*(float)i/(float)n) * c1; + } + + __forceinline float set_limittangent_b(const size_t i, const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n)); + return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0; + } + + __forceinline float set_limittangent_c(const size_t n) + { + if (unlikely(n == 0)) return 1.0f; + return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * 
sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n))); + } + + public: + + __forceinline float cos_2PI_div_n(const size_t n) + { + if (likely(n <= MAX_RING_FACE_VALENCE)) + return table_cos_2PI_div_n[n]; + else + return set_cos_2PI_div_n(n); + } + + __forceinline float limittangent_a(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_a[n][i]; + } + + __forceinline float limittangent_b(const size_t i, const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + assert(i < n); + return table_limittangent_b[n][i]; + } + + __forceinline float limittangent_c(const size_t n) + { + assert(n <= MAX_RING_FACE_VALENCE); + return table_limittangent_c[n]; + } + + static CatmullClarkPrecomputedCoefficients table; + + CatmullClarkPrecomputedCoefficients(); + ~CatmullClarkPrecomputedCoefficients(); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h new file mode 100644 index 00000000000..ab1d63594a0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h @@ -0,0 +1,562 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_ring.h" +#include "bezier_curve.h" + +namespace embree +{ + template + class __aligned(64) CatmullClarkPatchT + { + public: + typedef CatmullClark1RingT CatmullClark1Ring; + typedef typename CatmullClark1Ring::Type Type; + + array_t,4> ring; + + public: + __forceinline CatmullClarkPatchT () {} + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) { + init(first_half_edge,vertices,stride); + } + + __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride) + 
{ + for (unsigned i=0; i<4; i++) + ring[i].init(first_half_edge+i,vertices,stride); + + assert(verify()); + } + + __forceinline size_t bytes() const { + return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes(); + } + + __forceinline void serialize(void* ptr, size_t& ofs) const + { + for (size_t i=0; i<4; i++) + ring[i].serialize((char*)ptr,ofs); + } + + __forceinline void deserialize(void* ptr) + { + size_t ofs = 0; + for (size_t i=0; i<4; i++) + ring[i].deserialize((char*)ptr,ofs); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds (ring[0].bounds()); + for (size_t i=1; i<4; i++) + bounds.extend(ring[i].bounds()); + return bounds; + } + + __forceinline Type type() const + { + const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES; + const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES; + return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES); + } + + __forceinline bool isFinalResolution(float res) const { + return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res); + } + + static __forceinline void init_regular(const CatmullClark1RingT& p0, + const CatmullClark1RingT& p1, + CatmullClark1RingT& dest0, + CatmullClark1RingT& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = 
(Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_border(const CatmullClark1RingT &p0, + const CatmullClark1RingT &p1, + CatmullClark1RingT &dest0, + CatmullClark1RingT &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = 
dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT &dest) + { + dest.vertex_level = 0.0f; + dest.face_valence = 4; + dest.edge_valence = 8; + dest.border_index = -1; + dest.vtx = (Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (size_t i=0; i<8; i++) + dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8]; + for (size_t i=0; i<4; i++) + dest.crease_weight[i] = 0.0f; + + dest.eval_start_index = (8-offset)>>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t& patch) const + { + ring[0].subdivide(patch[0].ring[0]); + ring[1].subdivide(patch[1].ring[1]); + ring[2].subdivide(patch[2].ring[2]); + ring[3].subdivide(patch[3].ring[3]); + + patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level; + + patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level; + patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[2].ring[0].edge_level = 
0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level; + patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + + patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level); + patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level); + patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level; + patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level; + + const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2; + if (likely(regular0)) + init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + else + init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]); + + const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2; + if (likely(regular1)) + init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + else + init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]); + + const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2; + if (likely(regular2)) + init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + else + init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]); + + const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2; + if (likely(regular3)) + init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + else + init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]); + + Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f; + + Vertex_t center_ring[8]; + center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0]; + center_ring[7] = (Vertex_t)patch[3].ring[3].vtx; + center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0]; + center_ring[5] = (Vertex_t)patch[2].ring[2].vtx; + center_ring[4] = 
(Vertex_t)patch[1].ring[1].ring[0]; + center_ring[3] = (Vertex_t)patch[1].ring[1].vtx; + center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0]; + center_ring[1] = (Vertex_t)patch[0].ring[0].vtx; + + init_regular(center,center_ring,0,patch[0].ring[2]); + init_regular(center,center_ring,2,patch[1].ring[3]); + init_regular(center,center_ring,4,patch[2].ring[0]); + init_regular(center,center_ring,6,patch[3].ring[1]); + + assert(patch[0].verify()); + assert(patch[1].verify()); + assert(patch[2].verify()); + assert(patch[3].verify()); + } + + bool verify() const { + return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions(); + } + + __forceinline void init( FinalQuad& quad ) const + { + quad.vtx[0] = (Vertex_t)ring[0].vtx; + quad.vtx[1] = (Vertex_t)ring[1].vtx; + quad.vtx[2] = (Vertex_t)ring[2].vtx; + quad.vtx[3] = (Vertex_t)ring[3].vtx; + }; + + friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p) + { + o << "CatmullClarkPatch { " << embree_endl; + for (size_t i=0; i<4; i++) + o << "ring" << i << ": " << p.ring[i] << embree_endl; + o << "}" << embree_endl; + return o; + } + }; + + typedef CatmullClarkPatchT CatmullClarkPatch3fa; + + template + class __aligned(64) GeneralCatmullClarkPatchT + { + public: + typedef CatmullClarkPatchT CatmullClarkPatch; + typedef CatmullClark1RingT CatmullClark1Ring; + typedef BezierCurveT BezierCurve; + + static const unsigned SIZE = MAX_PATCH_VALENCE; + DynamicStackArray,8,SIZE> ring; + unsigned N; + + __forceinline GeneralCatmullClarkPatchT () + : N(0) {} + + GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) { + init(h,vertices,stride); + } + + __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView& vertices) { + init(first_half_edge,vertices.getPtr(),vertices.getStride()); + } + + __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride) + 
{ + unsigned int i = 0; + const HalfEdge* edge = h; + do { + ring[i].init(edge,vertices,stride); + edge = edge->next(); + i++; + } while ((edge != h) && (i < SIZE)); + N = i; + } + + __forceinline unsigned size() const { + return N; + } + + __forceinline bool isQuadPatch() const { + return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads; + } + + static __forceinline void init_regular(const CatmullClark1RingT& p0, + const CatmullClark1RingT& p1, + CatmullClark1RingT& dest0, + CatmullClark1RingT& dest1) + { + assert(p1.face_valence > 2); + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 4; + dest1.edge_valence = dest0.edge_valence = 8; + dest1.border_index = dest0.border_index = -1; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx; + dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4]; + dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1]; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2]; + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1]; + dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 3; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = 
p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + + static __forceinline void init_border(const CatmullClark1RingT &p0, + const CatmullClark1RingT &p1, + CatmullClark1RingT &dest0, + CatmullClark1RingT &dest1) + { + dest1.vertex_level = dest0.vertex_level = p0.edge_level; + dest1.face_valence = dest0.face_valence = 3; + dest1.edge_valence = dest0.edge_valence = 6; + dest0.border_index = 2; + dest1.border_index = 4; + dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0]; + dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f; + + dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1]; + dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0]; + dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx; + dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy + dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx; + dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2]; + + dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f; + dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1]; + dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0]; + + if (p0.eval_unique_identifier <= p1.eval_unique_identifier) + { + dest0.eval_start_index = 1; + dest1.eval_start_index = 2; + dest0.eval_unique_identifier = p0.eval_unique_identifier; + dest1.eval_unique_identifier = p0.eval_unique_identifier; + } + else + { + dest0.eval_start_index = 2; + dest1.eval_start_index = 0; + dest0.eval_unique_identifier = p1.eval_unique_identifier; + dest1.eval_unique_identifier = p1.eval_unique_identifier; + } + } + + static __forceinline void init_regular(const Vertex_t ¢er, const array_t& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT &dest) + { + assert(N<(MAX_RING_FACE_VALENCE)); + assert(2*N<(MAX_RING_EDGE_VALENCE)); + dest.vertex_level = vertex_level; + dest.face_valence = N; + dest.edge_valence = 2*N; + dest.border_index = -1; + dest.vtx = 
(Vertex_t)center; + dest.vertex_crease_weight = 0.0f; + for (unsigned i=0; i<2*N; i++) { + dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)]; + assert(isvalid(dest.ring[i])); + } + for (unsigned i=0; i>1; + if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence; + + assert( dest.eval_start_index < dest.face_valence ); + dest.eval_unique_identifier = 0; + } + + __noinline void subdivide(array_t& patch, unsigned& N_o) const + { + N_o = N; + assert( N ); + for (unsigned i=0; i center_ring; + float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads + + for (unsigned i=0; i 2; + if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + else init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); + + assert( patch[i].ring[1].hasValidPositions() ); + assert( patch[ip1].ring[3].hasValidPositions() ); + + float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level); + patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level; + center_vertex_level = max(center_vertex_level,level); + + center += ring[i].vtx; + center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx; + center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0]; + } + center /= float(N); + + for (unsigned int i=0; i& patches) + { + CatmullClark1Ring patches1ring1 = patches[1].ring[1]; + patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments + patches[1].ring[0] = patches[1].ring[3]; + patches[1].ring[3] = patches[1].ring[2]; + patches[1].ring[2] = patches1ring1; + + CatmullClark1Ring patches2ring2 = patches[2].ring[2]; + patches[2].ring[2] = patches[2].ring[0]; + patches[2].ring[0] = patches2ring2; + CatmullClark1Ring patches2ring3 = patches[2].ring[3]; + patches[2].ring[3] = patches[2].ring[1]; + patches[2].ring[1] = patches2ring3; + + CatmullClark1Ring patches3ring3 = patches[3].ring[3]; + 
patches[3].ring[3] = patches[3].ring[0]; + patches[3].ring[0] = patches[3].ring[1]; + patches[3].ring[1] = patches[3].ring[2]; + patches[3].ring[2] = patches3ring3; + } + + __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const + { + Vertex P0 = ring[0].getLimitVertex(); + for (unsigned i=0; i GeneralCatmullClarkPatch3fa; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h new file mode 100644 index 00000000000..73b41fd4ff4 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h @@ -0,0 +1,826 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/geometry.h" +#include "../common/buffer.h" +#include "half_edge.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + struct __aligned(64) FinalQuad { + Vec3fa vtx[4]; + }; + + template + struct __aligned(64) CatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + int border_index; //!< edge index where border starts + unsigned int face_valence; //!< number of adjacent quad faces + unsigned int edge_valence; //!< number of adjacent edges (2*face_valence) + float vertex_crease_weight; //!< weight of vertex crease (0 if no vertex crease) + DynamicStackArray crease_weight; //!< edge crease weights for each adjacent edge + float vertex_level; //!< maximum level of all adjacent edges + float edge_level; //!< level of first edge + unsigned int eval_start_index; //!< topology dependent index to start evaluation + unsigned int eval_unique_identifier; //!< topology dependent unique identifier for this ring + Vertex vtx; //!< center vertex + DynamicStackArray ring; //!< ring of neighboring vertices + + public: + CatmullClark1RingT () + : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty + + /*! 
calculates number of bytes required to serialize this structure */ + __forceinline size_t bytes() const + { + size_t ofs = 0; + ofs += sizeof(border_index); + ofs += sizeof(face_valence); + assert(2*face_valence == edge_valence); + ofs += sizeof(vertex_crease_weight); + ofs += face_valence*sizeof(float); + ofs += sizeof(vertex_level); + ofs += sizeof(edge_level); + ofs += sizeof(eval_start_index); + ofs += sizeof(eval_unique_identifier); + ofs += sizeof(vtx); + ofs += edge_valence*sizeof(Vertex); + return ofs; + } + + template + static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) { + *(Ty*)&ptr[ofs] = v; ofs += sizeof(Ty); + } + + template + static __forceinline void load(char* ptr, size_t& ofs, Ty& v) { + v = *(Ty*)&ptr[ofs]; ofs += sizeof(Ty); + } + + /*! serializes the ring to some memory location */ + __forceinline void serialize(char* ptr, size_t& ofs) const + { + store(ptr,ofs,border_index); + store(ptr,ofs,face_valence); + store(ptr,ofs,vertex_crease_weight); + for (size_t i=0; ii); + return ring[i]; + } + + __forceinline const Vertex& back(size_t i) const { + assert(edge_valence>=i); + return ring[edge_valence-i]; + } + + __forceinline bool has_last_face() const { + return (size_t)border_index != (size_t)edge_valence-2; + } + + __forceinline bool has_opposite_front(size_t i) const { + return (size_t)border_index != 2*i; + } + + __forceinline bool has_opposite_back(size_t i) const { + return (size_t)border_index != ((size_t)edge_valence-2-2*i); + } + + __forceinline BBox3fa bounds() const + { + BBox3fa bounds ( vtx ); + for (size_t i = 0; igetStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + + HalfEdge* p = (HalfEdge*) h; + + unsigned i=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + + do + { + vertex_level = max(vertex_level,p->edge_level); + crease_weight[i/2] = p->edge_crease_weight; + 
assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + + /* store first two vertices of face */ + p = p->next(); + const unsigned index0 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + p = p->next(); + + const unsigned index1 = p->getStartVertexIndex(); + ring[i++] = Vertex_t::loadu(vertices+index1*stride); + p = p->next(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex */ + const unsigned index0 = p->getStartVertexIndex(); + if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; } + + /*! mark first border edge and store dummy vertex for face between the two border edges */ + border_index = i; + crease_weight[i/2] = inf; + ring[i++] = Vertex_t::loadu(vertices+index0*stride); + ring[i++] = vtx; // dummy vertex + + /*! 
goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = i; + face_valence = i >> 1; + eval_unique_identifier = min_vertex_index; + eval_start_index = min_vertex_index_face; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1RingT& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = edge_valence; + dest.border_index = border_index; + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_index; + dest.eval_unique_identifier = eval_unique_identifier; + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t i=0; i= face_valence) face_index -= face_valence; assert(face_index < face_valence); + size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence); + size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence); + size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence); + S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f; + } + + /* calculate new edge points */ + size_t num_creases = 0; + array_t crease_id; + + for (size_t i=0; i= face_valence) face_index -= face_valence; + const float edge_crease = crease_weight[face_index]; + dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f); + + size_t index = 2*face_index; + size_t prev_index = face_index == 0 ? 
edge_valence-1 : 2*face_index-1; + size_t next_index = 2*face_index+1; + + const Vertex_t v = vtx + ring[index]; + const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index]; + S += ring[index]; + + /* fast path for regular edge points */ + if (likely(edge_crease <= 0.0f)) { + dest.ring[index] = (v+f) * 0.25f; + } + + /* slower path for hard edge rule */ + else { + crease_id[num_creases++] = face_index; + dest.ring[index] = v*0.5f; + + /* even slower path for blended edge rule */ + if (unlikely(edge_crease < 1.0f)) { + dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease); + } + } + } + + /* compute new vertex using smooth rule */ + const float inv_face_valence = 1.0f / (float)face_valence; + const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence; + dest.vtx = v_smooth; + + /* compute new vertex using vertex_crease_weight rule */ + if (unlikely(vertex_crease_weight > 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight); + } + return; + } + + /* no edge crease rule and dart rule */ + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) + { + /* update vertex using crease rule */ + const size_t crease0 = crease_id[0], crease1 = crease_id[1]; + const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f); + dest.vtx = v_sharp; + + /* update crease_weights using chaikin rule */ + const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1]; + dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + + /* interpolate between sharp and smooth rule */ + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = 
lerp(v_smooth,v_sharp,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + __forceinline bool isRegular1() const + { + if (border_index == -1) { + if (face_valence == 4) return true; + } else { + if (face_valence < 4) return true; + } + return false; + } + + __forceinline size_t numEdgeCreases() const + { + ssize_t numCreases = 0; + for (size_t i=0; i 0.0f; + } + return numCreases; + } + + enum Type { + TYPE_NONE = 0, //!< invalid type + TYPE_REGULAR = 1, //!< regular patch when ignoring creases + TYPE_REGULAR_CREASES = 2, //!< regular patch when considering creases + TYPE_GREGORY = 4, //!< gregory patch when ignoring creases + TYPE_GREGORY_CREASES = 8, //!< gregory patch when considering creases + TYPE_CREASES = 16 //!< patch has crease features + }; + + __forceinline Type type() const + { + /* check if there is an edge crease anywhere */ + const size_t numCreases = numEdgeCreases(); + const bool noInnerCreases = hasBorder() ? numCreases == 2 : numCreases == 0; + + Type crease_mask = (Type) (TYPE_REGULAR | TYPE_GREGORY); + if (noInnerCreases ) crease_mask = (Type) (crease_mask | TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES); + if (numCreases != 0) crease_mask = (Type) (crease_mask | TYPE_CREASES); + + /* calculate if this vertex is regular */ + bool hasBorder = border_index != -1; + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if (vertex_crease_weight == float(inf)) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return TYPE_CREASES; + } + else if (vertex_crease_weight != 0.0f) return TYPE_CREASES; + else if (face_valence == 3 && hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else if 
(face_valence == 4 && !hasBorder) return (Type) (crease_mask & (TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + else return (Type) (crease_mask & (TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES)); + } + + __forceinline bool isFinalResolution(float res) const { + return vertex_level <= res; + } + + /* computes the limit vertex */ + __forceinline Vertex getLimitVertex() const + { + /* return hard corner */ + if (unlikely(std::isinf(vertex_crease_weight))) + return vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f; + } + + Vertex_t F( 0.0f ); + Vertex_t E( 0.0f ); + + assert(eval_start_index < face_valence); + + for (size_t i=0; i= face_valence) index -= face_valence; + F += ring[2*index+1]; + E += ring[2*index]; + } + + const float n = (float)face_valence; + return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); + } + + /* gets limit tangent in the direction of egde vtx -> ring[0] */ + __forceinline Vertex getLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[0] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != (int)edge_valence-2 ) { + return ring[0] - vtx; + } + else + { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 
0 : border_index+2; + return (ring[second_border_index] - ring[border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + Vertex_t q( 0.0f ); + for (size_t i=0; i= face_valence) index -= face_valence; + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma * (alpha + beta); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + if (unlikely(std::isinf(vertex_crease_weight))) + return ring[2] - vtx; + + /* border vertex rule */ + if (unlikely(border_index != -1)) + { + if (border_index != 2) { + return ring[2] - vtx; + } + else { + const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2; + return (ring[border_index] - ring[second_border_index]) * 0.5f; + } + } + + Vertex_t alpha( 0.0f ); + Vertex_t beta ( 0.0f ); + + const size_t n = face_valence; + + assert(eval_start_index < face_valence); + + for (size_t i=0; i= face_valence) index -= face_valence; + + size_t prev_index = index == 0 ? 
face_valence-1 : index-1; // need to be bit-wise exact in cosf eval + const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n); + const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n); + alpha += a * ring[2*index]; + beta += b * ring[2*index+1]; + } + + const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n); + return sigma* (alpha + beta); + } + + /* gets surface normal */ + const Vertex getNormal() const { + return cross(getLimitTangent(),getSecondLimitTangent()); + } + + /* returns center of the n-th quad in the 1-ring */ + __forceinline Vertex getQuadCenter(const size_t index) const + { + const Vertex_t &p0 = vtx; + const Vertex_t &p1 = ring[2*index+0]; + const Vertex_t &p2 = ring[2*index+1]; + const Vertex_t &p3 = index == face_valence-1 ? ring[0] : ring[2*index+2]; + const Vertex p = (p0+p1+p2+p3) * 0.25f; + return p; + } + + /* returns center of the n-th edge in the 1-ring */ + __forceinline Vertex getEdgeCenter(const size_t index) const { + return (vtx + ring[index*2]) * 0.5f; + } + + bool hasValidPositions() const + { + for (size_t i=0; i " << c.ring[i]; + if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2]; + o << embree_endl; + } + return o; + } + }; + + typedef CatmullClark1RingT CatmullClark1Ring3fa; + + template + struct __aligned(64) GeneralCatmullClark1RingT + { + ALIGNED_STRUCT_(64); + + typedef CatmullClark1RingT CatmullClark1Ring; + + struct Face + { + __forceinline Face() {} + __forceinline Face (int size, float crease_weight) + : size(size), crease_weight(crease_weight) {} + + // FIXME: add member that returns total number of vertices + + int size; // number of vertices-2 of nth face in ring + float crease_weight; + }; + + Vertex vtx; + DynamicStackArray ring; + DynamicStackArray faces; + unsigned int face_valence; + unsigned int edge_valence; + int border_face; + float vertex_crease_weight; + float vertex_level; //!< maximum level of adjacent edges 
+ float edge_level; // level of first edge + bool only_quads; // true if all faces are quads + unsigned int eval_start_face_index; + unsigned int eval_start_vertex_index; + unsigned int eval_unique_identifier; + + public: + GeneralCatmullClark1RingT() + : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {} + + __forceinline bool isRegular() const + { + if (border_face == -1 && face_valence == 4) return true; + return false; + } + + __forceinline bool has_last_face() const { + return border_face != (int)face_valence-1; + } + + __forceinline bool has_second_face() const { + return (border_face == -1) || (border_face >= 2); + } + + bool hasValidPositions() const + { + for (size_t i=0; igetStartVertexIndex()*stride); + vertex_crease_weight = h->vertex_crease_weight; + HalfEdge* p = (HalfEdge*) h; + + unsigned int e=0, f=0; + unsigned min_vertex_index = (unsigned)-1; + unsigned min_vertex_index_face = (unsigned)-1; + unsigned min_vertex_index_vertex = (unsigned)-1; + edge_level = p->edge_level; + vertex_level = 0.0f; + do + { + HalfEdge* p_prev = p->prev(); + HalfEdge* p_next = p->next(); + const float crease_weight = p->edge_crease_weight; + assert(p->hasOpposite() || p->edge_crease_weight == float(inf)); + vertex_level = max(vertex_level,p->edge_level); + + /* find minimum start vertex */ + unsigned vertex_index = p_next->getStartVertexIndex(); + if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /* store first N-2 vertices of face */ + unsigned int vn = 0; + for (p = p_next; p!=p_prev; p=p->next()) { + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + vn++; + } + faces[f++] = Face(vn,crease_weight); + only_quads &= (vn == 2); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + /* find minimum start vertex 
*/ + unsigned vertex_index = p->getStartVertexIndex(); + if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; } + + /*! mark first border edge and store dummy vertex for face between the two border edges */ + border_face = f; + faces[f++] = Face(2,inf); + ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride); + ring[e++] = vtx; // dummy vertex + + /*! goto other side of border */ + p = (HalfEdge*) h; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != h); + + edge_valence = e; + face_valence = f; + eval_unique_identifier = min_vertex_index; + eval_start_face_index = min_vertex_index_face; + eval_start_vertex_index = min_vertex_index_vertex; + + assert( hasValidPositions() ); + } + + __forceinline void subdivide(CatmullClark1Ring& dest) const + { + dest.edge_level = 0.5f*edge_level; + dest.vertex_level = 0.5f*vertex_level; + dest.face_valence = face_valence; + dest.edge_valence = 2*face_valence; + dest.border_index = border_face == -1 ? 
-1 : 2*border_face; // FIXME: + dest.vertex_crease_weight = max(0.0f,vertex_crease_weight-1.0f); + dest.eval_start_index = eval_start_face_index; + dest.eval_unique_identifier = eval_unique_identifier; + assert(dest.face_valence <= MAX_RING_FACE_VALENCE); + + /* calculate face points */ + Vertex_t S = Vertex_t(0.0f); + for (size_t face=0, v=eval_start_vertex_index; face crease_id; + Vertex_t C = Vertex_t(0.0f); + for (size_t face=0, j=eval_start_vertex_index; face 0.0f)) + { + if (vertex_crease_weight >= 1.0f) { + dest.vtx = vtx; + } else { + dest.vtx = lerp(vtx,v_smooth,vertex_crease_weight); + } + return; + } + + if (likely(num_creases <= 1)) + return; + + /* compute new vertex using crease rule */ + if (likely(num_creases == 2)) { + const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f); + const float crease_weight0 = faces[crease_id[0]].crease_weight; + const float crease_weight1 = faces[crease_id[1]].crease_weight; + dest.vtx = v_sharp; + dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f); + dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f); + const float v_blend = 0.5f*(crease_weight0+crease_weight1); + if (unlikely(v_blend < 1.0f)) { + dest.vtx = lerp(v_sharp,v_smooth,v_blend); + } + } + + /* compute new vertex using corner rule */ + else { + dest.vtx = vtx; + } + } + + void convert(CatmullClark1Ring& dst) const + { + dst.edge_level = edge_level; + dst.vertex_level = vertex_level; + dst.vtx = vtx; + dst.face_valence = face_valence; + dst.edge_valence = 2*face_valence; + dst.border_index = border_face == -1 ? 
-1 : 2*border_face; + for (size_t i=0; i ring[0] */ + __forceinline Vertex getLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getLimitTangent(); + } + + /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + __forceinline Vertex getSecondLimitTangent() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + { + convert(cc_vtx); + return cc_vtx.getSecondLimitTangent(); + } + + subdivide(cc_vtx); + return 2.0f * cc_vtx.getSecondLimitTangent(); + } + + + /* gets limit vertex */ + __forceinline Vertex getLimitVertex() const + { + CatmullClark1Ring cc_vtx; + + /* fast path for quad only rings */ + if (only_quads) + convert(cc_vtx); + else + subdivide(cc_vtx); + return cc_vtx.getLimitVertex(); + } + + friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c) + { + o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence << + ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl; + for (size_t v=0, f=0; f " << c.ring[i]; + if (i == v) o << " crease = " << c.faces[f].crease_weight; + o << embree_endl; + } + } + return o; + } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h new file mode 100644 index 00000000000..b244af481cb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h @@ -0,0 +1,296 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/scene_curves.h" + +/* + + Implements Catmul Rom curves with control points p0, p1, p2, p3. 
At + t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 + the curve goes through p2 with tangent (p3-p2)/2. + + */ + +namespace embree +{ + class CatmullRomBasis + { + public: + + template + static __forceinline Vec4 eval(const T& u) + { + const T t = u; + const T s = T(1.0f) - u; + const T n0 = - t * s * s; + const T n1 = 2.0f + t * t * (3.0f * t - 5.0f); + const T n2 = 2.0f + s * s * (3.0f * s - 5.0f); + const T n3 = - s * t * t; + return T(0.5f) * Vec4(n0, n1, n2, n3); + } + + template + static __forceinline Vec4 derivative(const T& u) + { + const T t = u; + const T s = 1.0f - u; + const T n0 = - s * s + 2.0f * s * t; + const T n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t; + const T n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; + const T n3 = -2.0f * s * t + t * t; + return T(0.5f) * Vec4(n0, n1, n2, n3); + } + + template + static __forceinline Vec4 derivative2(const T& u) + { + const T t = u; + const T n0 = -3.0f * t + 2.0f; + const T n1 = 9.0f * t - 5.0f; + const T n2 = -9.0f * t + 4.0f; + const T n3 = 3.0f * t - 1.0f; + return Vec4(n0, n1, n2, n3); + } + }; + + struct PrecomputedCatmullRomBasis + { + enum { N = 16 }; + public: + PrecomputedCatmullRomBasis() {} + PrecomputedCatmullRomBasis(int shift); + + /* basis for bspline evaluation */ + public: + float c0[N+1][N+1]; + float c1[N+1][N+1]; + float c2[N+1][N+1]; + float c3[N+1][N+1]; + + /* basis for bspline derivative evaluation */ + public: + float d0[N+1][N+1]; + float d1[N+1][N+1]; + float d2[N+1][N+1]; + float d3[N+1][N+1]; + }; + extern PrecomputedCatmullRomBasis catmullrom_basis0; + extern PrecomputedCatmullRomBasis catmullrom_basis1; + + template + struct CatmullRomCurveT + { + Vertex v0,v1,v2,v3; + + __forceinline CatmullRomCurveT() {} + + __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3) + : v0(v0), v1(v1), v2(v2), v3(v3) {} + + __forceinline Vertex begin() const { + return 
madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + } + + __forceinline Vertex end() const { + return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + } + + __forceinline Vertex center() const { + return 0.25f*(v0+v1+v2+v3); + } + + __forceinline BBox bounds() const { + return merge(BBox(v0),BBox(v1),BBox(v2),BBox(v3)); + } + + __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) { + return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b); + } + + __forceinline CatmullRomCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,v0-p), v0.w); + const Vec3ff q1(xfmVector(space,v1-p), v1.w); + const Vec3ff q2(xfmVector(space,v2-p), v2.w); + const Vec3ff q3(xfmVector(space,v3-p), v3.w); + return CatmullRomCurveT(q0,q1,q2,q3); + } + + __forceinline Vertex eval(const float t) const + { + const Vec4 b = CatmullRomBasis::eval(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_du(const float t) const + { + const Vec4 b = CatmullRomBasis::derivative(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline Vertex eval_dudu(const float t) const + { + const Vec4 b = CatmullRomBasis::derivative2(t); + return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3))); + } + + __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const + { + p = eval(t); + dp = eval_du(t); + ddp = eval_dudu(t); + } + + template + __forceinline Vec4vf veval(const vfloat& t) const + { + const Vec4vf b = CatmullRomBasis::eval(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_du(const vfloat& t) const + { + const Vec4vf b = CatmullRomBasis::derivative(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf veval_dudu(const vfloat& t) const + { + const Vec4vf b 
= CatmullRomBasis::derivative2(t); + return madd(b.x, Vec4vf(v0), madd(b.y, Vec4vf(v1), madd(b.z, Vec4vf(v2), b.w * Vec4vf(v3)))); + } + + template + __forceinline void veval(const vfloat& t, Vec4vf& p, Vec4vf& dp) const + { + p = veval(t); + dp = veval_du(t); + } + + template + __forceinline Vec4vf eval0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf eval1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf derivative0(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf(v2), + vfloat::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf(v3)))); + } + + template + __forceinline Vec4vf derivative1(const int ofs, const int size) const + { + assert(size <= PrecomputedCatmullRomBasis::N); + assert(ofs <= size); + return madd(vfloat::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf(v0), + madd(vfloat::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf(v1), + madd(vfloat::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf(v2), + 
vfloat::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf(v3)))); + } + + /* calculates bounds of catmull-rom curve geometry */ + __forceinline BBox3fa accurateRoundBounds() const + { + const int N = 7; + const float scale = 1.0f/(3.0f*(N-1)); + Vec4vfx pl(pos_inf), pu(neg_inf); + for (int i=0; i<=N; i+=VSIZEX) + { + vintx vi = vintx(i)+vintx(step); + vboolx valid = vi <= vintx(N); + const Vec4vfx p = eval0(i,N); + const Vec4vfx dp = derivative0(i,N); + const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); + const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); + pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min + pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked min + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const float r_min = reduce_min(pl.w); + const float r_max = reduce_max(pu.w); + const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); + return enlarge(BBox3fa(lower,upper),upper_r); + } + + /* calculates bounds when tessellated into N line segments */ + __forceinline BBox3fa accurateFlatBounds(int N) const + { + if (likely(N == 4)) + { + const Vec4vf4 pi = eval0<4>(0,4); + const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z)); + const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z)); + const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w))); + const Vec3ff pe = end(); + return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w)))); + } + else + { + Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f); + for (int i=0; i<=N; i+=VSIZEX) + { + vboolx valid = vintx(i)+vintx(step) <= vintx(N); + const Vec4vfx pi = eval0(i,N); + + pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min + pl.y = select(valid,min(pl.y,pi.y),pl.y); + pl.z = select(valid,min(pl.z,pi.z),pl.z); + + pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: 
use masked min + pu.y = select(valid,max(pu.y,pi.y),pu.y); + pu.z = select(valid,max(pu.z,pi.z),pu.z); + + ru = select(valid,max(ru,abs(pi.w)),ru); + } + const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z)); + const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z)); + const Vec3fa upper_r(reduce_max(ru)); + return enlarge(BBox3fa(lower,upper),upper_r); + } + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) { + return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }"; + } + }; + + __forceinline CatmullRomCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT& curve) + { + return CatmullRomCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2), + enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3)); + } + + typedef CatmullRomCurveT CatmullRomCurve3fa; +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h new file mode 100644 index 00000000000..23f24c360c8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h @@ -0,0 +1,226 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template + struct FeatureAdaptiveEval + { + public: + + typedef PatchT Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; + typedef CatmullClark1RingT CatmullClarkRing; + typedef CatmullClarkPatchT CatmullClarkPatch; + typedef BSplinePatchT BSplinePatch; + typedef BezierPatchT BezierPatch; + typedef GregoryPatchT GregoryPatch; + typedef 
BilinearPatchT BilinearPatch; + typedef BezierCurveT BezierCurve; + + public: + + FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval(patch,Vec2f(u,v),0); + break; + } + } + } + + FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + eval(patch,Vec2f(u,v),dscale,depth); + } + + void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t& patches, const Vec2f& uv, size_t depth) + { + float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,0); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdx; *dPdv = dpdy; + } + } + else { +#if 
PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,1); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdy; *dPdv = dpdx; + } + } + } else { + if (u > 0.5f) { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,2); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = -dpdx; *dPdv = -dpdy; + } + } + else { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,3); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1); +#endif + if (dPdu && dPdv) { + const Vertex dpdx = *dPdu, dpdy = *dPdv; + *dPdu = dpdy; *dPdv = -dpdx; + } + } + } + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const int max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? 
PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=(size_t)max_eval_depth; +//#else + return depth>=(size_t)max_eval_depth; +//#endif + } + + void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + while (true) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(234423,c,c,-1); + return; + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(34534,c,-1,c); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); + RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(43524,c,c,-1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(23498,c,-1,c); + return; + } +#endif + else + { + array_t patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + + const float u = uv.x, v = uv.y; + if (v < 0.5f) { + if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; } + else { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; } + } else { + if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; } + else { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); 
dscale *= 2.0f; } + } + depth++; + } + } + } + + void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval(qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned N; + array_t patches; + patch.subdivide(patches,N); // FIXME: only have to generate one of the patches + + /* parametrization for quads */ + if (N == 4) + eval_general_quad(patch,patches,uv,depth); + + /* parametrization for arbitrary polygons */ + else + { + const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; + const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; + const unsigned i = 4*h+l; assert(i= N) return; + +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[2]; patch.getLimitBorder(borders,i); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[i],Vec2f(u,v),1.0f,depth+1); +#endif + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h new file mode 100644 index 00000000000..76583b2e5dc --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h @@ -0,0 +1,359 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "catmullclark_patch.h" +#include "bspline_patch.h" +#include "gregory_patch.h" +#include "tessellation.h" + +namespace embree +{ + namespace isa + { + 
struct FeatureAdaptiveEvalGrid + { + typedef CatmullClark1Ring3fa CatmullClarkRing; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BilinearPatch3fa BilinearPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth; + //const unsigned dheight; + unsigned count; + + + public: + FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1)); + + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch3fa qpatch; patch.init(qpatch); + eval(qpatch, srange, erange, 0); + assert(count == (x1-x0+1)*(y1-y0+1)); + return; + } + + /* subdivide patch */ + unsigned N; + array_t patches; + patch.subdivide(patches,N); + + if (N == 4) + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f 
srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); + eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r); + eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr); + eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr); + eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches); + eval(patches[0],srange0,intersect(srange0,erange),1); + eval(patches[1],srange1,intersect(srange1,erange),1); + eval(patches[2],srange2,intersect(srange2,erange),1); + eval(patches[3],srange3,intersect(srange3,erange),1); +#endif + } + else + { + assert(subPatch < N); + +#if PATCH_USE_GREGORY == 2 + BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch); + BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r); + eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r); +#else + eval(patches[subPatch], srange, erange, 1); +#endif + + } + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch, + const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned 
sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0) + { + eval(patch,srange,erange,depth); + } + + template + void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iy=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, + const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr) + { + if (erange.empty()) + return; + + int lx0 = (int) ceilf(erange.lower.x); + int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + int ly0 = (int) ceilf(erange.lower.y); + int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) return; + + CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } else { + IrregularFillPatch ipatch(patch,border0,border1,border2,border3); + evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1); + return; + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + 
assert(depth > 0); + RegularPatch rpatch(patch,border0,border1,border2,border3); + evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1); + return; + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); + GregoryPatch gpatch(patch,border0,border1,border2,border3); + evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); + } +#endif + else + { + array_t patches; + patch.subdivide(patches); + + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + eval(patches[0],srange0,intersect(srange0,erange),depth+1); + eval(patches[1],srange1,intersect(srange1,erange),depth+1); + eval(patches[2],srange2,intersect(srange2,erange),depth+1); + eval(patches[3],srange3,intersect(srange3,erange),depth+1); + } + } + }; + + template + bool stitch_col(const Patch& patch, int subPatch, + const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_y <= fine_y); + if (likely(fine_y == coarse_y)) + return false; + + const unsigned y0s = stitch(y0,fine_y,coarse_y); + const unsigned y1s = stitch(y1,fine_y,coarse_y); + const unsigned M = y1s-y0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,64*sizeof(float)); + dynamic_large_stack_array(float,py,M,64*sizeof(float)); + dynamic_large_stack_array(float,pz,M,64*sizeof(float)); + dynamic_large_stack_array(float,u,M,64*sizeof(float)); + dynamic_large_stack_array(float,v,M,64*sizeof(float)); + dynamic_large_stack_array(float,nx,M,64*sizeof(float)); + dynamic_large_stack_array(float,ny,M,64*sizeof(float)); + dynamic_large_stack_array(float,nz,M,64*sizeof(float)); + const bool has_Nxyz = 
Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, + has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097); + + for (unsigned y=y0; y<=y1; y++) + { + const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; + Px[(y-y0)*dwidth+dx0] = px[ys]; + Py[(y-y0)*dwidth+dx0] = py[ys]; + Pz[(y-y0)*dwidth+dx0] = pz[ys]; + U [(y-y0)*dwidth+dx0] = u[ys]; + V [(y-y0)*dwidth+dx0] = v[ys]; + if (unlikely(has_Nxyz)) { + Nx[(y-y0)*dwidth+dx0] = nx[ys]; + Ny[(y-y0)*dwidth+dx0] = ny[ys]; + Nz[(y-y0)*dwidth+dx0] = nz[ys]; + } + } + return true; + } + + template + bool stitch_row(const Patch& patch, int subPatch, + const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) + { + assert(coarse_x <= fine_x); + if (likely(fine_x == coarse_x)) + return false; + + const unsigned x0s = stitch(x0,fine_x,coarse_x); + const unsigned x1s = stitch(x1,fine_x,coarse_x); + const unsigned M = x1s-x0s+1 + VSIZEX; + + dynamic_large_stack_array(float,px,M,32*sizeof(float)); + dynamic_large_stack_array(float,py,M,32*sizeof(float)); + dynamic_large_stack_array(float,pz,M,32*sizeof(float)); + dynamic_large_stack_array(float,u,M,32*sizeof(float)); + dynamic_large_stack_array(float,v,M,32*sizeof(float)); + dynamic_large_stack_array(float,nx,M,32*sizeof(float)); + dynamic_large_stack_array(float,ny,M,32*sizeof(float)); + dynamic_large_stack_array(float,nz,M,32*sizeof(float)); + const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); + Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, + has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? 
(float*)nz : nullptr, 4097,1); + + for (unsigned x=x0; x<=x1; x++) + { + const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; + Px[dy0*dwidth+x-x0] = px[xs]; + Py[dy0*dwidth+x-x0] = py[xs]; + Pz[dy0*dwidth+x-x0] = pz[xs]; + U [dy0*dwidth+x-x0] = u[xs]; + V [dy0*dwidth+x-x0] = v[xs]; + if (unlikely(has_Nxyz)) { + Nx[dy0*dwidth+x-x0] = nx[xs]; + Ny[dy0*dwidth+x-x0] = ny[xs]; + Nz[dy0*dwidth+x-x0] = nz[xs]; + } + } + return true; + } + + template + void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight) + { + bool sl = false, sr = false, st = false, sb = false; + if (levels) { + sl = x0 == 0 && stitch_col(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sr = x1 == swidth-1 && stitch_col(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); + st = y0 == 0 && stitch_row(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0 ,dwidth,dheight); + sb = y1 == sheight-1 && stitch_row(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); + } + const unsigned ofs = st*dwidth+sl; + Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); + } + } +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h new file mode 100644 index 00000000000..fa3216730f8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h @@ -0,0 +1,186 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 
+ +#pragma once + +#include "patch.h" + +namespace embree +{ + namespace isa + { + template + struct FeatureAdaptiveEvalSimd + { + public: + + typedef PatchT Patch; + typedef typename Patch::Ref Ref; + typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; + typedef CatmullClark1RingT CatmullClarkRing; + typedef CatmullClarkPatchT CatmullClarkPatch; + typedef BSplinePatchT BSplinePatch; + typedef BezierPatchT BezierPatch; + typedef GregoryPatchT GregoryPatch; + typedef BilinearPatchT BilinearPatch; + typedef BezierCurveT BezierCurve; + + FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; + case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + eval_direct(valid,patch,Vec2(u,v),0); + break; + } + } + } + + FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + eval_direct(valid,patch,Vec2(u,v),dscale,depth); + 
} + + template + __forceinline void eval_quad_direct(const vbool& valid, array_t& patches, const Vec2& uv, float dscale, size_t depth) + { + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); + } + + template + __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t& patches, const Vec2& uv, float dscale, size_t depth) + { +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); +#endif + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + const vfloat u = uv.x, v = uv.y; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; +#if PATCH_USE_GREGORY == 2 + if (any(u0v0_mask)) 
eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2(2.0f*u,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1); + if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1); + if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1); +#endif + } + + __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) + { + const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? 
PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; +//#if PATCH_MIN_RESOLUTION +// return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2& uv, float dscale, size_t depth, + BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr) + { + typename CatmullClarkPatch::Type ty = patch.type(); + + if (unlikely(final(patch,ty,depth))) + { + if (ty & CatmullClarkRing::TYPE_REGULAR) { + RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } else { + IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } +#endif + else + { + array_t patches; + patch.subdivide(patches); // FIXME: only have to generate one of the patches + eval_quad_direct(valid,patches,uv,dscale,depth); + } + } + + void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2& uv, const size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) { + CatmullClarkPatch qpatch; patch.init(qpatch); + return eval_direct(valid,qpatch,uv,1.0f,depth); + } + + /* subdivide patch */ + unsigned Nc; + array_t patches; + patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches + + /* 
parametrization for quads */ + if (Nc == 4) + eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth); + + /* parametrization for arbitrary polygons */ + else + { + const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; + const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,i(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r); +#else + eval_direct(valid,patches[i],Vec2(u,v),1.0f,depth+1); +#endif + }); + } + } + + private: + float* const P; + float* const dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const size_t N; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h new file mode 100644 index 00000000000..2a7c4b1f2c2 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h @@ -0,0 +1,893 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bezier_patch.h" +#include "bezier_curve.h" +#include "catmullclark_coefficients.h" + +namespace embree +{ + template + class __aligned(64) GregoryPatchT + { + typedef CatmullClarkPatchT CatmullClarkPatch; + typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; + typedef CatmullClark1RingT CatmullClark1Ring; + typedef BezierCurveT BezierCurve; + + public: + Vertex v[4][4]; + Vertex f[2][2]; + + __forceinline GregoryPatchT() {} + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { + init(patch); + } + + __forceinline GregoryPatchT(const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + { + init_crackfix(patch,border0,border1,border2,border3); + } + + __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t 
stride) { + init(CatmullClarkPatch(edge,vertices,stride)); + } + + __forceinline Vertex& p0() { return v[0][0]; } + __forceinline Vertex& p1() { return v[0][3]; } + __forceinline Vertex& p2() { return v[3][3]; } + __forceinline Vertex& p3() { return v[3][0]; } + + __forceinline Vertex& e0_p() { return v[0][1]; } + __forceinline Vertex& e0_m() { return v[1][0]; } + __forceinline Vertex& e1_p() { return v[1][3]; } + __forceinline Vertex& e1_m() { return v[0][2]; } + __forceinline Vertex& e2_p() { return v[3][2]; } + __forceinline Vertex& e2_m() { return v[2][3]; } + __forceinline Vertex& e3_p() { return v[2][0]; } + __forceinline Vertex& e3_m() { return v[3][1]; } + + __forceinline Vertex& f0_p() { return v[1][1]; } + __forceinline Vertex& f1_p() { return v[1][2]; } + __forceinline Vertex& f2_p() { return v[2][2]; } + __forceinline Vertex& f3_p() { return v[2][1]; } + __forceinline Vertex& f0_m() { return f[0][0]; } + __forceinline Vertex& f1_m() { return f[0][1]; } + __forceinline Vertex& f2_m() { return f[1][1]; } + __forceinline Vertex& f3_m() { return f[1][0]; } + + __forceinline const Vertex& p0() const { return v[0][0]; } + __forceinline const Vertex& p1() const { return v[0][3]; } + __forceinline const Vertex& p2() const { return v[3][3]; } + __forceinline const Vertex& p3() const { return v[3][0]; } + + __forceinline const Vertex& e0_p() const { return v[0][1]; } + __forceinline const Vertex& e0_m() const { return v[1][0]; } + __forceinline const Vertex& e1_p() const { return v[1][3]; } + __forceinline const Vertex& e1_m() const { return v[0][2]; } + __forceinline const Vertex& e2_p() const { return v[3][2]; } + __forceinline const Vertex& e2_m() const { return v[2][3]; } + __forceinline const Vertex& e3_p() const { return v[2][0]; } + __forceinline const Vertex& e3_m() const { return v[3][1]; } + + __forceinline const Vertex& f0_p() const { return v[1][1]; } + __forceinline const Vertex& f1_p() const { return v[1][2]; } + __forceinline const Vertex& f2_p() 
const { return v[2][2]; } + __forceinline const Vertex& f3_p() const { return v[2][1]; } + __forceinline const Vertex& f0_m() const { return f[0][0]; } + __forceinline const Vertex& f1_m() const { return f[0][1]; } + __forceinline const Vertex& f2_m() const { return f[1][1]; } + __forceinline const Vertex& f3_m() const { return f[1][0]; } + + __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { + return irreg_patch.ring[index].getLimitVertex(); + } + + __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { + return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx); + } + + __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx); + } + + __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) + { + CatmullClark1Ring3fa r0,r1,r2; + irreg_patch.ring[index].subdivide(r0); + r0.subdivide(r1); + r1.subdivide(r2); + return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx); + } + + void initFaceVertex(const CatmullClarkPatch& irreg_patch, + const size_t index, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx) + { + const unsigned int face_valence = irreg_patch.ring[index].face_valence; + const unsigned int edge_valence = 
irreg_patch.ring[index].edge_valence; + const unsigned int border_index = irreg_patch.ring[index].border_index; + + const Vertex& vtx = irreg_patch.ring[index].vtx; + const Vertex e_i = irreg_patch.ring[index].getEdgeCenter(0); + const Vertex c_i_m_1 = irreg_patch.ring[index].getQuadCenter(0); + const Vertex e_i_m_1 = irreg_patch.ring[index].getEdgeCenter(1); + + Vertex c_i, e_i_p_1; + const bool hasHardEdge0 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[0]); + + if (unlikely((border_index == edge_valence-2) || hasHardEdge0)) + { + /* mirror quad center and edge mid-point */ + c_i = madd(2.0f, e_i - c_i_m_1, c_i_m_1); + e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1); + } + else + { + c_i = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); + e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 ); + } + + Vertex c_i_m_2, e_i_m_2; + const bool hasHardEdge1 = + std::isinf(irreg_patch.ring[index].vertex_crease_weight) && + std::isinf(irreg_patch.ring[index].crease_weight[1]); + + if (unlikely(border_index == 2 || hasHardEdge1)) + { + /* mirror quad center and edge mid-point */ + c_i_m_2 = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1); + e_i_m_2 = madd(2.0f, vtx - e_i, + e_i); + } + else + { + c_i_m_2 = irreg_patch.ring[index].getQuadCenter( 1 ); + e_i_m_2 = irreg_patch.ring[index].getEdgeCenter( 2 ); + } + + const float d = 3.0f; + //const float c = cosf(2.0f*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3); + + const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); + const Vertex 
r_e_m = 1.0f/3.0f * (e_i - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2); + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const CatmullClarkPatch& patch) + { + assert( patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + + } + + __noinline void init_crackfix(const CatmullClarkPatch& patch, + const BezierCurve* border0, + const BezierCurve* border1, + const BezierCurve* border2, + const BezierCurve* border3) + { + assert( 
patch.ring[0].hasValidPositions() ); + assert( patch.ring[1].hasValidPositions() ); + assert( patch.ring[2].hasValidPositions() ); + assert( patch.ring[3].hasValidPositions() ); + + p0() = initCornerVertex(patch,0); + p1() = initCornerVertex(patch,1); + p2() = initCornerVertex(patch,2); + p3() = initCornerVertex(patch,3); + + e0_p() = initPositiveEdgeVertex(patch,0, p0()); + e1_p() = initPositiveEdgeVertex(patch,1, p1()); + e2_p() = initPositiveEdgeVertex(patch,2, p2()); + e3_p() = initPositiveEdgeVertex(patch,3, p3()); + + e0_m() = initNegativeEdgeVertex(patch,0, p0()); + e1_m() = initNegativeEdgeVertex(patch,1, p1()); + e2_m() = initNegativeEdgeVertex(patch,2, p2()); + e3_m() = initNegativeEdgeVertex(patch,3, p3()); + + if (unlikely(border0 != nullptr)) + { + p0() = border0->v0; + e0_p() = border0->v1; + e1_m() = border0->v2; + p1() = border0->v3; + } + + if (unlikely(border1 != nullptr)) + { + p1() = border1->v0; + e1_p() = border1->v1; + e2_m() = border1->v2; + p2() = border1->v3; + } + + if (unlikely(border2 != nullptr)) + { + p2() = border2->v0; + e2_p() = border2->v1; + e3_m() = border2->v2; + p3() = border2->v3; + } + + if (unlikely(border3 != nullptr)) + { + p3() = border3->v0; + e3_p() = border3->v1; + e0_m() = border3->v2; + p0() = border3->v3; + } + + const unsigned int face_valence_p0 = patch.ring[0].face_valence; + const unsigned int face_valence_p1 = patch.ring[1].face_valence; + const unsigned int face_valence_p2 = patch.ring[2].face_valence; + const unsigned int face_valence_p3 = patch.ring[3].face_valence; + + initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); + initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() ); + initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() ); + initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p3,f3_p(),f3_m() ); + } + + + void 
computeGregoryPatchFacePoints(const unsigned int face_valence, + const Vertex& r_e_p, + const Vertex& r_e_m, + const Vertex& p_vtx, + const Vertex& e0_p_vtx, + const Vertex& e1_m_vtx, + const unsigned int face_valence_p1, + const Vertex& e0_m_vtx, + const Vertex& e3_p_vtx, + const unsigned int face_valence_p3, + Vertex& f_p_vtx, + Vertex& f_m_vtx, + const float d = 3.0f) + { + //const float c = cosf(2.0*M_PI/(float)face_valence); + //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1); + //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3); + + const float c = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); + const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1); + const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3); + + + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p); + f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m); + } + + __noinline void init(const GeneralCatmullClarkPatch& patch) + { + assert(patch.size() == 4); +#if 0 + CatmullClarkPatch qpatch; patch.init(qpatch); + init(qpatch); +#else + const float face_valence_p0 = patch.ring[0].face_valence; + const float face_valence_p1 = patch.ring[1].face_valence; + const float face_valence_p2 = patch.ring[2].face_valence; + const float face_valence_p3 = patch.ring[3].face_valence; + + Vertex p0_r_p, p0_r_m; + patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m ); + + Vertex p1_r_p, p1_r_m; + patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m ); + + Vertex p2_r_p, p2_r_m; + patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), 
p2_r_p, p2_r_m ); + + Vertex p3_r_p, p3_r_m; + patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m ); + + computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() ); + computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() ); + computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() ); + computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p3, f3_p(), f3_m() ); + +#endif + } + + + __forceinline void convert_to_bezier() + { + f0_p() = (f0_p() + f0_m()) * 0.5f; + f1_p() = (f1_p() + f1_m()) * 0.5f; + f2_p() = (f2_p() + f2_m()) * 0.5f; + f3_p() = (f3_p() + f3_m()) * 0.5f; + f0_m() = Vertex( zero ); + f1_m() = Vertex( zero ); + f2_m() = Vertex( zero ); + f3_m() = Vertex( zero ); + } + + static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv, + Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21) + { + if (unlikely(uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f)) + { + matrix_11 = matrix[1][1]; + matrix_12 = matrix[1][2]; + matrix_22 = matrix[2][2]; + matrix_21 = matrix[2][1]; + } + else + { + const Vertex_t f0_p = matrix[1][1]; + const Vertex_t f1_p = matrix[1][2]; + const Vertex_t f2_p = matrix[2][2]; + const Vertex_t f3_p = matrix[2][1]; + + const Vertex_t f0_m = f_m[0][0]; + const Vertex_t f1_m = f_m[0][1]; + const Vertex_t f2_m = f_m[1][1]; + const Vertex_t f3_m = f_m[1][0]; + + matrix_11 = ( uu * f0_p + vv * f0_m)*rcp(uu+vv); + matrix_12 = ((1.0f-uu) * f1_m + vv * f1_p)*rcp(1.0f-uu+vv); + matrix_22 = ((1.0f-uu) * f2_p + (1.0f-vv) * f2_m)*rcp(2.0f-uu-vv); + matrix_21 = ( uu 
* f3_m + (1.0f-vv) * f3_p)*rcp(1.0f+uu-vv); + } + } + + template + static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], + size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const auto m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const vfloat f0_p = v[1][1][i]; + const vfloat f1_p = v[1][2][i]; + const vfloat f2_p = v[2][2][i]; + const vfloat f3_p = v[2][1][i]; + + const vfloat f0_m = f[0][0][i]; + const vfloat f1_m = f[0][1][i]; + const vfloat f2_m = f[1][1][i]; + const vfloat f3_m = f[1][0][i]; + + const vfloat one_minus_uu = vfloat(1.0f) - uu; + const vfloat one_minus_vv = vfloat(1.0f) - vv; + + const vfloat f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const vfloat f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const vfloat f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const vfloat f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + matrix_11 = select(m_border,f0_p,f0_i); + matrix_12 = select(m_border,f1_p,f1_i); + matrix_22 = select(m_border,f2_p,f2_i); + matrix_21 = select(m_border,f3_p,f3_i); + } + + static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::eval(uu); + const Vec4 Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_du(const 
Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::derivative(uu); + const Vec4 Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::eval(uu); + const Vec4 Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::derivative2(uu); + const Vec4 Bv = BezierBasis::eval(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + 
madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::eval(uu); + const Vec4 Bv = BezierBasis::derivative2(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative + { + Vertex_t v_11, v_12, v_22, v_21; + computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21); + + const Vec4 Bu = BezierBasis::derivative(uu); + const Vec4 Bv = BezierBasis::derivative(vv); + + return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), + madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11 ,madd(Bu.z,v_12 ,Bu.w * matrix[1][3]))), + madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21 ,madd(Bu.z,v_22 ,Bu.w * matrix[2][3]))), + Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); + } + + __forceinline Vertex eval(const float uu, const float vv) const { + return eval(v,f,uu,vv); + } + + __forceinline Vertex eval_du( const float uu, const float vv) const { + return eval_du(v,f,uu,vv); + } + + __forceinline Vertex eval_dv( const float uu, const float vv) const { + return 
eval_dv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudu( const float uu, const float vv) const { + return eval_dudu(v,f,uu,vv); + } + + __forceinline Vertex eval_dvdv( const float uu, const float vv) const { + return eval_dvdv(v,f,uu,vv); + } + + __forceinline Vertex eval_dudv( const float uu, const float vv) const { + return eval_dudv(v,f,uu,vv); + } + + static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv) // FIXME: why not using basis functions + { + /* interpolate inner vertices */ + Vertex_t matrix_11, matrix_12, matrix_22, matrix_21; + computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21); + + /* tangentU */ + const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]); + const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11 , (Vertex_t)matrix_21 , (Vertex_t)matrix[3][1]); + const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12 , (Vertex_t)matrix_22 , (Vertex_t)matrix[3][2]); + const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]); + const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11 , (Vertex_t)matrix_12 , (Vertex_t)matrix[1][3]); + const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21 , (Vertex_t)matrix_22 , (Vertex_t)matrix[2][3]); + const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]); + + const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU 
x tangentV */ + const Vertex_t n = cross(tangentU,tangentV); + + return n; + } + + __forceinline Vertex normal( const float uu, const float vv) const { + return normal(v,f,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, + Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, + const float dscale = 1.0f) const + { + if (P) { + *P = eval(u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = eval_du(u,v)*dscale; + assert(dPdv); *dPdv = eval_dv(u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); + } + } + + template + static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], + const size_t i, const vfloat& uu, const vfloat& vv, const Vec4& u_n, const Vec4& v_n, + vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) + { + const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i])))); + const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i])))); + const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i])))); + const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i])))); + return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x))); + } + + template + static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], + const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) + { + if (P) { + const 
Vec4 u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::derivative(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::derivative(vv); + for (size_t i=0; i u_n = BezierBasis::derivative2(uu); + const Vec4 v_n = BezierBasis::eval(vv); + for (size_t i=0; i u_n = BezierBasis::eval(uu); + const Vec4 v_n = BezierBasis::derivative2(vv); + for (size_t i=0; i u_n = BezierBasis::derivative(uu); + const Vec4 v_n = BezierBasis::derivative(vv); + for (size_t i=0; i + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, + const float dscale, const size_t dstride, const size_t N) const { + eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + } + + template + static __forceinline Vec3 eval_t(const Vertex matrix[4][4], const Vec3 f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + + const Vec3 f0_p = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3 f1_p = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3 f2_p = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3 f3_p = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3 f0_m = f[0][0]; + const Vec3 f1_m = f[0][1]; + const Vec3 f2_m = f[1][1]; + const Vec3 f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3 f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3 f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3 f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3 f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + + const Vec3 F0( 
select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3 F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3 F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3 F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); + + const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu; + const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv; + const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu); + const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv); + const T B2_u = 3.0f * (uu * one_minus_uu * uu); + const T B2_v = 3.0f * (vv * one_minus_vv * vv); + const T B3_u = uu * uu * uu; + const T B3_v = vv * vv * vv; + + const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), + madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x ,madd(B2_u,F1.x ,B3_u * matrix[1][3].x))), + madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x ,madd(B2_u,F2.x ,B3_u * matrix[2][3].x))), + B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); + + const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))), + madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y ,madd(B2_u,F1.y ,B3_u * matrix[1][3].y))), + madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y ,madd(B2_u,F2.y ,B3_u * matrix[2][3].y))), + B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y)))))); + + const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))), + madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z ,madd(B2_u,F1.z ,B3_u * matrix[1][3].z))), + madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z ,madd(B2_u,F2.z 
,B3_u * matrix[2][3].z))), + B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z)))))); + + return Vec3(x,y,z); + } + + template + __forceinline Vec3 eval(const T& uu, const T& vv) const + { + Vec3 ff[2][2]; + ff[0][0] = Vec3(f[0][0]); + ff[0][1] = Vec3(f[0][1]); + ff[1][1] = Vec3(f[1][1]); + ff[1][0] = Vec3(f[1][0]); + return eval_t(v,ff,uu,vv); + } + + template + static __forceinline Vec3 normal_t(const Vertex matrix[4][4], const Vec3 f[2][2], const T& uu, const T& vv) + { + typedef typename T::Bool M; + + const Vec3 f0_p = Vec3(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z); + const Vec3 f1_p = Vec3(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z); + const Vec3 f2_p = Vec3(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z); + const Vec3 f3_p = Vec3(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z); + + const Vec3 f0_m = f[0][0]; + const Vec3 f1_m = f[0][1]; + const Vec3 f2_m = f[1][1]; + const Vec3 f3_m = f[1][0]; + + const T one_minus_uu = T(1.0f) - uu; + const T one_minus_vv = T(1.0f) - vv; + + const Vec3 f0_i = ( uu * f0_p + vv * f0_m) * rcp(uu+vv); + const Vec3 f1_i = (one_minus_uu * f1_m + vv * f1_p) * rcp(one_minus_uu+vv); + const Vec3 f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv); + const Vec3 f3_i = ( uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv); + +#if 1 + const M m_corner0 = (uu == 0.0f) & (vv == 0.0f); + const M m_corner1 = (uu == 1.0f) & (vv == 0.0f); + const M m_corner2 = (uu == 1.0f) & (vv == 1.0f); + const M m_corner3 = (uu == 0.0f) & (vv == 1.0f); + const Vec3 matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) ); + const Vec3 matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) ); + const Vec3 matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) ); + const Vec3 matrix_21( 
select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) ); +#else + const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f); + const Vec3 matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) ); + const Vec3 matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) ); + const Vec3 matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) ); + const Vec3 matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) ); +#endif + + const Vec3 matrix_00 = Vec3(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z); + const Vec3 matrix_10 = Vec3(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z); + const Vec3 matrix_20 = Vec3(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z); + const Vec3 matrix_30 = Vec3(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z); + + const Vec3 matrix_01 = Vec3(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z); + const Vec3 matrix_02 = Vec3(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z); + const Vec3 matrix_03 = Vec3(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z); + + const Vec3 matrix_31 = Vec3(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z); + const Vec3 matrix_32 = Vec3(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z); + const Vec3 matrix_33 = Vec3(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z); + + const Vec3 matrix_13 = Vec3(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z); + const Vec3 matrix_23 = Vec3(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z); + + /* tangentU */ + const Vec3 col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30); + const Vec3 col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31); + const Vec3 col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32); + const Vec3 col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33); + + const Vec3 tangentU = 
deCasteljau_tangent(uu, col0, col1, col2, col3); + + /* tangentV */ + const Vec3 row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03); + const Vec3 row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13); + const Vec3 row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23); + const Vec3 row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33); + + const Vec3 tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3); + + /* normal = tangentU x tangentV */ + const Vec3 n = cross(tangentU,tangentV); + return n; + } + + template + __forceinline Vec3 normal(const T& uu, const T& vv) const + { + Vec3 ff[2][2]; + ff[0][0] = Vec3(f[0][0]); + ff[0][1] = Vec3(f[0][1]); + ff[1][1] = Vec3(f[1][1]); + ff[1][0] = Vec3(f[1][0]); + return normal_t(v,ff,uu,vv); + } + + __forceinline BBox bounds() const + { + const Vertex *const cv = &v[0][0]; + BBox bounds (cv[0]); + for (size_t i=1; i<16; i++) + bounds.extend( cv[i] ); + bounds.extend(f[0][0]); + bounds.extend(f[1][0]); + bounds.extend(f[1][1]); + bounds.extend(f[1][1]); + return bounds; + } + + friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + o << "v[" << y << "][" << x << "] " << p.v[y][x] << embree_endl; + + for (size_t y=0; y<2; y++) + for (size_t x=0; x<2; x++) + o << "f[" << y << "][" << x << "] " << p.f[y][x] << embree_endl; + return o; + } + }; + + typedef GregoryPatchT GregoryPatch3fa; + + template + __forceinline BezierPatchT::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) + { + CatmullClarkPatchT patch(edge,vertices,stride); + GregoryPatchT gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template + __forceinline BezierPatchT::BezierPatchT(const CatmullClarkPatchT& patch) + { + GregoryPatchT gpatch(patch); + gpatch.convert_to_bezier(); + for (size_t y=0; 
y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } + + template + __forceinline BezierPatchT::BezierPatchT(const CatmullClarkPatchT& patch, + const BezierCurveT* border0, + const BezierCurveT* border1, + const BezierCurveT* border2, + const BezierCurveT* border3) + { + GregoryPatchT gpatch(patch,border0,border1,border2,border3); + gpatch.convert_to_bezier(); + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = (Vertex_t)gpatch.v[y][x]; + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h new file mode 100644 index 00000000000..85effd02cfb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h @@ -0,0 +1,113 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gregory_patch.h" + +namespace embree +{ + class __aligned(64) DenseGregoryPatch3fa + { + typedef Vec3fa Vec3fa_4x4[4][4]; + public: + + __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch) + { + for (size_t y=0; y<4; y++) + for (size_t x=0; x<4; x++) + matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f); + + matrix[0][0].w = patch.f[0][0].x; + matrix[0][1].w = patch.f[0][0].y; + matrix[0][2].w = patch.f[0][0].z; + matrix[0][3].w = 0.0f; + + matrix[1][0].w = patch.f[0][1].x; + matrix[1][1].w = patch.f[0][1].y; + matrix[1][2].w = patch.f[0][1].z; + matrix[1][3].w = 0.0f; + + matrix[2][0].w = patch.f[1][1].x; + matrix[2][1].w = patch.f[1][1].y; + matrix[2][2].w = patch.f[1][1].z; + matrix[2][3].w = 0.0f; + + matrix[3][0].w = patch.f[1][0].x; + matrix[3][1].w = patch.f[1][0].y; + matrix[3][2].w = patch.f[1][0].z; + matrix[3][3].w = 0.0f; + } + + __forceinline void extract_f_m(Vec3fa f_m[2][2]) const + { + f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3fa( 
matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + } + + __forceinline Vec3fa eval(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline Vec3fa normal(const float uu, const float vv) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template + __forceinline Vec3 eval(const T &uu, const T &vv) const + { + Vec3 f_m[2][2]; + f_m[0][0] = Vec3( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + template + __forceinline Vec3 normal(const T &uu, const T &vv) const + { + Vec3 f_m[2][2]; + f_m[0][0] = Vec3( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w ); + f_m[0][1] = Vec3( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w ); + f_m[1][1] = Vec3( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w ); + f_m[1][0] = Vec3( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w ); + return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); + } + + __forceinline void eval(const float u, const float v, + Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv, + const float dscale = 1.0f) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + if (P) { + *P = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); + } + if (dPdu) { + assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; + } + if (ddPdudu) { + assert(ddPdudu); *ddPdudu = 
GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); + } + } + + template + __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const + { + __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m); + GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N); + } + + private: + Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component + }; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h new file mode 100644 index 00000000000..4fd741c8790 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h @@ -0,0 +1,96 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" + +namespace embree +{ + struct __aligned(16) GridRange + { + unsigned int u_start; + unsigned int u_end; + unsigned int v_start; + unsigned int v_end; + + __forceinline GridRange() {} + + __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) + : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {} + + __forceinline unsigned int width() const { + return u_end-u_start+1; + } + + __forceinline unsigned int height() const { + return v_end-v_start+1; + } + + __forceinline bool hasLeafSize() const + { + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + assert(u_size >= 1); + assert(v_size >= 1); + return u_size <= 3 && v_size <= 3; + } + + static __forceinline unsigned int split(unsigned int start,unsigned int end) + { + const unsigned int center = (start+end)/2; + assert (center > 
start); + assert (center < end); + return center; + } + + __forceinline void split(GridRange& r0, GridRange& r1) const + { + assert( hasLeafSize() == false ); + const unsigned int u_size = u_end-u_start+1; + const unsigned int v_size = v_end-v_start+1; + r0 = *this; + r1 = *this; + + if (u_size >= v_size) + { + const unsigned int u_mid = split(u_start,u_end); + r0.u_end = u_mid; + r1.u_start = u_mid; + } + else + { + const unsigned int v_mid = split(v_start,v_end); + r0.v_end = v_mid; + r1.v_start = v_mid; + } + } + + __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const + { + assert( !hasLeafSize() ); + unsigned int children = 0; + GridRange first,second; + split(first,second); + + if (first.hasLeafSize()) { + r[0] = first; + children++; + } + else { + first.split(r[0],r[1]); + children += 2; + } + + if (second.hasLeafSize()) { + r[children] = second; + children++; + } + else { + second.split(r[children+0],r[children+1]); + children += 2; + } + return children; + } + }; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h new file mode 100644 index 00000000000..fb350ca71fe --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h @@ -0,0 +1,371 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_coefficients.h" + +namespace embree +{ + class __aligned(32) HalfEdge + { + friend class SubdivMesh; + public: + + enum PatchType : char { + BILINEAR_PATCH = 0, //!< a bilinear patch + REGULAR_QUAD_PATCH = 1, //!< a regular quad patch can be represented as a B-Spline + IRREGULAR_QUAD_PATCH = 2, //!< an irregular quad patch can be represented as a Gregory patch + COMPLEX_PATCH = 3 //!< these patches need subdivision and cannot be processed by the above fast code paths + }; + + enum VertexType : char { + REGULAR_VERTEX = 0, //!< regular vertex + NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a 
non-manifold edge + }; + + __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) { + return (PatchType) max((int)ty0,(int)ty1); + } + + struct Edge + { + /*! edge constructor */ + __forceinline Edge(const uint32_t v0, const uint32_t v1) + : v0(v0), v1(v1) {} + + /*! create an 64 bit identifier that is unique for the not oriented edge */ + __forceinline operator uint64_t() const + { + uint32_t p0 = v0, p1 = v1; + if (p0next(); } + __forceinline const HalfEdge* rotate() const { return opposite()->next(); } + + __forceinline unsigned int getStartVertexIndex() const { return vtx_index; } + __forceinline unsigned int getEndVertexIndex () const { return next()->vtx_index; } + __forceinline Edge getEdge () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); } + + + /*! tests if the start vertex of the edge is regular */ + __forceinline PatchType vertexType() const + { + const HalfEdge* p = this; + size_t face_valence = 0; + bool hasBorder = false; + + do + { + /* we need subdivision to handle edge creases */ + if (p->hasOpposite() && p->edge_crease_weight > 0.0f) + return COMPLEX_PATCH; + + face_valence++; + + /* test for quad */ + const HalfEdge* pp = p; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp == p) return COMPLEX_PATCH; + pp = pp->next(); if (pp != p) return COMPLEX_PATCH; + + /* continue with next face */ + p = p->prev(); + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else + { + face_valence++; + hasBorder = true; + p = this; + while (p->hasOpposite()) + p = p->rotate(); + } + } while (p != this); + + /* calculate vertex type */ + if (face_valence == 2 && hasBorder) { + if (vertex_crease_weight == 0.0f ) return REGULAR_QUAD_PATCH; + else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH; + else return COMPLEX_PATCH; + } + else if 
(vertex_crease_weight != 0.0f) return COMPLEX_PATCH; + else if (face_valence == 3 && hasBorder) return REGULAR_QUAD_PATCH; + else if (face_valence == 4 && !hasBorder) return REGULAR_QUAD_PATCH; + else return IRREGULAR_QUAD_PATCH; + } + + /*! tests if this edge is part of a bilinear patch */ + __forceinline bool bilinearVertex() const { + return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf); + } + + /*! calculates the type of the patch */ + __forceinline PatchType patchType() const + { + const HalfEdge* p = this; + PatchType ret = REGULAR_QUAD_PATCH; + bool bilinear = true; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) == this) return COMPLEX_PATCH; + + ret = max(ret,p->vertexType()); + bilinear &= p->bilinearVertex(); + if ((p = p->next()) != this) return COMPLEX_PATCH; + + if (bilinear) return BILINEAR_PATCH; + return ret; + } + + /*! tests if the face is a regular b-spline face */ + __forceinline bool isRegularFace() const { + return patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the face can be diced (using bspline or gregory patch) */ + __forceinline bool isGregoryFace() const { + return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH; + } + + /*! tests if the base vertex of this half edge is a corner vertex */ + __forceinline bool isCorner() const { + return !hasOpposite() && !prev()->hasOpposite(); + } + + /*! tests if the vertex is attached to any border */ + __forceinline bool vertexHasBorder() const + { + const HalfEdge* p = this; + do { + if (!p->hasOpposite()) return true; + p = p->rotate(); + } while (p != this); + return false; + } + + /*! 
tests if the face this half edge belongs to has some border */ + __forceinline bool faceHasBorder() const + { + const HalfEdge* p = this; + do { + if (p->vertexHasBorder()) return true; + p = p->next(); + } while (p != this); + return false; + } + + /*! calculates conservative bounds of a catmull clark subdivision face */ + __forceinline BBox3fa bounds(const BufferView& vertices) const + { + BBox3fa bounds = this->get1RingBounds(vertices); + for (const HalfEdge* p=this->next(); p!=this; p=p->next()) + bounds.extend(p->get1RingBounds(vertices)); + return bounds; + } + + /*! tests if this is a valid patch */ + __forceinline bool valid(const BufferView& vertices) const + { + size_t N = 1; + if (!this->validRing(vertices)) return false; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) { + if (!p->validRing(vertices)) return false; + } + return N >= 3 && N <= MAX_PATCH_VALENCE; + } + + /*! counts number of polygon edges */ + __forceinline unsigned int numEdges() const + { + unsigned int N = 1; + for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++); + return N; + } + + /*! calculates face and edge valence */ + __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const + { + faceValence = 0; + edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + unsigned int numEdges = p->numEdges(); + assert(numEdges >= 3); + edgeValence += numEdges-2; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + } + + /*! 
stream output */ + friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h) + { + return o << "{ " << + "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << + "prev = " << h.prev_half_edge_ofs << ", " << + "next = " << h.next_half_edge_ofs << ", " << + "opposite = " << h.opposite_half_edge_ofs << ", " << + "edge_crease = " << h.edge_crease_weight << ", " << + "vertex_crease = " << h.vertex_crease_weight << ", " << + //"edge_level = " << h.edge_level << + " }"; + } + + private: + + /*! calculates the bounds of the face associated with the half-edge */ + __forceinline BBox3fa getFaceBounds(const BufferView& vertices) const + { + BBox3fa b = vertices[getStartVertexIndex()]; + for (const HalfEdge* p = next(); p!=this; p=p->next()) { + b.extend(vertices[p->getStartVertexIndex()]); + } + return b; + } + + /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */ + __forceinline BBox3fa get1RingBounds(const BufferView& vertices) const + { + BBox3fa bounds = empty; + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + bounds.extend(p->getFaceBounds(vertices)); + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return bounds; + } + + /*! tests if this is a valid face */ + __forceinline bool validFace(const BufferView& vertices, size_t& N) const + { + const Vec3fa v = vertices[getStartVertexIndex()]; + if (!isvalid(v)) return false; + size_t n = 1; + for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) { + const Vec3fa v = vertices[p->getStartVertexIndex()]; + if (!isvalid(v)) return false; + } + N += n-2; + return n >= 3 && n <= MAX_PATCH_VALENCE; + } + + /*! 
tests if this is a valid ring */ + __forceinline bool validRing(const BufferView& vertices) const + { + size_t faceValence = 0; + size_t edgeValence = 0; + + const HalfEdge* p = this; + do + { + /* calculate bounds of current face */ + if (!p->validFace(vertices,edgeValence)) + return false; + + faceValence++; + p = p->prev(); + + /* continue with next face */ + if (likely(p->hasOpposite())) + p = p->opposite(); + + /* if there is no opposite go the long way to the other side of the border */ + else { + faceValence++; + edgeValence++; + p = this; + while (p->hasOpposite()) + p = p->opposite()->next(); + } + + } while (p != this); + + return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE; + } + + private: + unsigned int vtx_index; //!< index of edge start vertex + int next_half_edge_ofs; //!< relative offset to next half edge of face + int prev_half_edge_ofs; //!< relative offset to previous half edge of face + int opposite_half_edge_ofs; //!< relative offset to opposite half edge + + public: + float edge_crease_weight; //!< crease weight attached to edge + float vertex_crease_weight; //!< crease weight attached to start vertex + float edge_level; //!< subdivision factor for edge + PatchType patch_type; //!< stores type of subdiv patch + VertexType vertex_type; //!< stores type of the start vertex + char align[2]; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h new file mode 100644 index 00000000000..9fab79cf0c7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h @@ -0,0 +1,38 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "bezier_curve.h" + +namespace embree +{ + template + struct HermiteCurveT : BezierCurveT + { + __forceinline HermiteCurveT() {} + + __forceinline HermiteCurveT(const BezierCurveT& curve) + : 
BezierCurveT(curve) {} + + __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1) + : BezierCurveT(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {} + + __forceinline HermiteCurveT xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const + { + const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w); + const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w); + const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w); + const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w); + return BezierCurveT(q0,q1,q2,q3); + } + }; + + __forceinline HermiteCurveT enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT& curve) { + return HermiteCurveT(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT(curve))); + } + + typedef HermiteCurveT HermiteCurve3fa; +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h new file mode 100644 index 00000000000..f4a854af7f3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h @@ -0,0 +1,403 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bezier_curve.h" + +namespace embree +{ + namespace isa + { + template + struct TensorLinearQuadraticBezierSurface + { + QuadraticBezierCurve L; + QuadraticBezierCurve R; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve& L, const QuadraticBezierCurve& R) + : L(L), R(R) {} + + __forceinline BBox bounds() const { + return 
merge(L.bounds(),R.bounds()); + } + }; + + template<> + struct TensorLinearQuadraticBezierSurface + { + QuadraticBezierCurve LR; + + __forceinline TensorLinearQuadraticBezierSurface() {} + + __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve& LR) + : LR(LR) {} + + __forceinline BBox bounds() const + { + const BBox b = LR.bounds(); + const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + }; + + template + struct TensorLinearCubicBezierSurface + { + CubicBezierCurve L; + CubicBezierCurve R; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : L(curve.L), R(curve.R) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + L = other.L; R = other.R; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& L, const CubicBezierCurve& R) + : L(L), R(R) {} + + template class SourceCurve> + __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve& center, const SourceCurve& normal) + { + SourceCurve vcurve = center; + SourceCurve ncurve = normal; + + /* here we construct a patch which follows the curve l(t) = + * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ + + const Vec3ff p0 = vcurve.eval(0.0f); + const Vec3ff dp0 = vcurve.eval_du(0.0f); + const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); + + const Vec3fa n0 = ncurve.eval(0.0f); + const Vec3fa dn0 = ncurve.eval_du(0.0f); + + const Vec3ff p1 = vcurve.eval(1.0f); + const Vec3ff dp1 = vcurve.eval_du(1.0f); + 
const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); + + const Vec3fa n1 = ncurve.eval(1.0f); + const Vec3fa dn1 = ncurve.eval_du(1.0f); + + const Vec3fa bt0 = cross(n0,dp0); + const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); + + const Vec3fa bt1 = cross(n1,dp1); + const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); + + const Vec3fa k0 = normalize(bt0); + const Vec3fa dk0 = dnormalize(bt0,dbt0); + + const Vec3fa k1 = normalize(bt1); + const Vec3fa dk1 = dnormalize(bt1,dbt1); + + const Vec3fa l0 = p0 - p0.w*k0; + const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0); + + const Vec3fa r0 = p0 + p0.w*k0; + const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0); + + const Vec3fa l1 = p1 - p1.w*k1; + const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1); + + const Vec3fa r1 = p1 + p1.w*k1; + const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1); + + const float scale = 1.0f/3.0f; + CubicBezierCurve L(l0,l0+scale*dl0,l1-scale*dl1,l1); + CubicBezierCurve R(r0,r0+scale*dr0,r1-scale*dr1,r1); + return TensorLinearCubicBezierSurface(L,R); + } + + __forceinline BBox bounds() const { + return merge(L.bounds(),R.bounds()); + } + + __forceinline BBox3fa accurateBounds() const { + return merge(L.accurateBounds(),R.accurateBounds()); + } + + __forceinline CubicBezierCurve reduce_v() const { + return merge(CubicBezierCurve>(L),CubicBezierCurve>(R)); + } + + __forceinline LinearBezierCurve reduce_u() const { + return LinearBezierCurve(L.bounds(),R.bounds()); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const V& dx) const { + return TensorLinearCubicBezierSurface(L.xfm(dx),R.xfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface vxfm(const V& dx) const { + return TensorLinearCubicBezierSurface(L.vxfm(dx),R.vxfm(dx)); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const V& dx, const V& p) const { + return TensorLinearCubicBezierSurface(L.xfm(dx,p),R.xfm(dx,p)); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space) const { + return 
TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space)); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space, const Vec3fa& p) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p)); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const { + return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s)); + } + + __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const { + return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve L0,L1; L.split(L0,L1,u); + CubicBezierCurve R0,R1; R.split(R0,R1,u); + new (&left ) TensorLinearCubicBezierSurface(L0,R0); + new (&right) TensorLinearCubicBezierSurface(L1,R1); + } + + __forceinline TensorLinearCubicBezierSurface vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface(L.split(u),R.split(u)); + } + + __forceinline V eval(const float u, const float v) const { + return clerp(L,R,V(v)).eval(u); + } + + __forceinline V eval_du(const float u, const float v) const { + return clerp(L,R,V(v)).eval_dt(u); + } + + __forceinline V eval_dv(const float u, const float v) const { + return (R-L).eval(u); + } + + __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const + { + V p0, dp0du; L.eval(u,p0,dp0du); + V p1, dp1du; R.eval(u,p1,dp1du); + p = lerp(p0,p1,v); + dpdu = lerp(dp0du,dp1du,v); + dpdv = p1-p0; + } + + 
__forceinline TensorLinearQuadraticBezierSurface derivative_u() const { + return TensorLinearQuadraticBezierSurface(L.derivative(),R.derivative()); + } + + __forceinline CubicBezierCurve derivative_v() const { + return R-L; + } + + __forceinline V axis_u() const { + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline V axis_v() const { + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.L << ", " << embree_endl + << " R = " << a.R << embree_endl + << "}"; + } + + friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) { + return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); + } + }; + + template<> + struct TensorLinearCubicBezierSurface + { + CubicBezierCurve LR; + + __forceinline TensorLinearCubicBezierSurface() {} + + __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve) + : LR(curve.LR) {} + + __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) { + LR = other.LR; return *this; + } + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& LR) + : LR(LR) {} + + __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve& L, const CubicBezierCurve& R) + : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {} + + __forceinline CubicBezierCurve getL() const { + return CubicBezierCurve(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3)); + } + + __forceinline CubicBezierCurve getR() const { + return 
CubicBezierCurve(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3))); + } + + __forceinline BBox bounds() const + { + const BBox b = LR.bounds(); + const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); + const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + return merge(bl,br); + } + + __forceinline BBox1f bounds(const Vec2fa& axis) const + { + const CubicBezierCurve LRx = LR; + const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); + const BBox Lb = LRa.bounds(); + const BBox Rb(shuffle<3>(Lb.lower),shuffle<3>(Lb.upper)); + const BBox b = merge(Lb,Rb); + return BBox1f(b.lower[0],b.upper[0]); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const Vec2fa& dx) const + { + const CubicBezierCurve LRx = LR; + const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface(CubicBezierCurve(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline TensorLinearCubicBezierSurface xfm(const Vec2fa& dx, const Vec2fa& p) const + { + const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p)); + const CubicBezierCurve LRx = LR-pxyxy; + const CubicBezierCurve LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); + const CubicBezierCurve LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); + return TensorLinearCubicBezierSurface(CubicBezierCurve(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), + CubicBezierCurve(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); + } + + __forceinline 
TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { + return TensorLinearCubicBezierSurface(LR.clip(u)); + } + + __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const + { + const CubicBezierCurve LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3)); + const CubicBezierCurve RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3)); + return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper))); + } + + __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { + return clip_v(v).clip_u(u); + } + + __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const + { + CubicBezierCurve LR0,LR1; LR.split(LR0,LR1,u); + new (&left ) TensorLinearCubicBezierSurface(LR0); + new (&right) TensorLinearCubicBezierSurface(LR1); + } + + __forceinline TensorLinearCubicBezierSurface vsplit_u(vboolx& valid, const BBox1f& u) const { + valid = true; clear(valid,VSIZEX-1); + return TensorLinearCubicBezierSurface(getL().split(u),getR().split(u)); + } + + __forceinline Vec2fa eval(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v)); + } + + __forceinline Vec2fa eval_du(const float u, const float v) const + { + const vfloat4 dpdu = LR.eval_dt(u); + return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v)); + } + + __forceinline Vec2fa eval_dv(const float u, const float v) const + { + const vfloat4 p = LR.eval(u); + return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); + } + + __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const + { + vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); + p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); + dpdu = 
Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); + dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); + } + + __forceinline TensorLinearQuadraticBezierSurface derivative_u() const { + return TensorLinearQuadraticBezierSurface(LR.derivative()); + } + + __forceinline CubicBezierCurve derivative_v() const { + return getR()-getL(); + } + + __forceinline Vec2fa axis_u() const + { + const CubicBezierCurve L = getL(); + const CubicBezierCurve R = getR(); + return (L.end()-L.begin())+(R.end()-R.begin()); + } + + __forceinline Vec2fa axis_v() const + { + const CubicBezierCurve L = getL(); + const CubicBezierCurve R = getR(); + return (R.begin()-L.begin())+(R.end()-L.end()); + } + + friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) + { + return cout << "TensorLinearCubicBezierSurface" << embree_endl + << "{" << embree_endl + << " L = " << a.getL() << ", " << embree_endl + << " R = " << a.getR() << embree_endl + << "}"; + } + }; + + typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface1f; + typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface2fa; + typedef TensorLinearCubicBezierSurface TensorLinearCubicBezierSurface3fa; + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch.h b/thirdparty/embree-aarch64/kernels/subdiv/patch.h new file mode 100644 index 00000000000..d58241b96d6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/patch.h @@ -0,0 +1,371 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "catmullclark_patch.h" +#include "bilinear_patch.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "tessellation_cache.h" + +#if 1 +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) +#else +#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z) \ + { \ + size_t hex = (size_t)ptr; \ + for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8); \ + const float c = (float)(((hex >> 
0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \ + if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f); \ + } +#endif + +#define PATCH_MAX_CACHE_DEPTH 2 +//#define PATCH_MIN_RESOLUTION 1 // FIXME: not yet completely implemented +#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10 // maximum evaluation depth at irregular vertices (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_MAX_EVAL_DEPTH_CREASE 10 // maximum evaluation depth at crease features (has to be larger or equal than PATCH_MAX_CACHE_DEPTH) +#define PATCH_USE_GREGORY 1 // 0 = no gregory, 1 = fill, 2 = as early as possible + +#if PATCH_USE_GREGORY==2 +#define PATCH_USE_BEZIER_PATCH 1 // enable use of bezier instead of b-spline patches +#else +#define PATCH_USE_BEZIER_PATCH 0 // enable use of bezier instead of b-spline patches +#endif + +#if PATCH_USE_BEZIER_PATCH +# define RegularPatch BezierPatch +# define RegularPatchT BezierPatchT +#else +# define RegularPatch BSplinePatch +# define RegularPatchT BSplinePatchT +#endif + +#if PATCH_USE_GREGORY +#define IrregularFillPatch GregoryPatch +#define IrregularFillPatchT GregoryPatchT +#else +#define IrregularFillPatch BilinearPatch +#define IrregularFillPatchT BilinearPatchT +#endif + +namespace embree +{ + template + struct __aligned(64) PatchT + { + public: + + typedef GeneralCatmullClarkPatchT GeneralCatmullClarkPatch; + typedef CatmullClarkPatchT CatmullClarkPatch; + typedef CatmullClark1RingT CatmullClarkRing; + typedef BezierCurveT BezierCurve; + + enum Type { + INVALID_PATCH = 0, + BILINEAR_PATCH = 1, + BSPLINE_PATCH = 2, + BEZIER_PATCH = 3, + GREGORY_PATCH = 4, + SUBDIVIDED_GENERAL_PATCH = 7, + SUBDIVIDED_QUAD_PATCH = 8, + EVAL_PATCH = 9, + }; + + struct Ref + { + __forceinline Ref(void* p = nullptr) + : ptr((size_t)p) {} + + __forceinline operator bool() const { return ptr != 0; } + __forceinline operator size_t() const { return ptr; } + + __forceinline Ref (Type ty, void* in) + : ptr(((size_t)in)+ty) { 
assert((((size_t)in) & 0xF) == 0); } + + __forceinline Type type () const { return (Type)(ptr & 0xF); } + __forceinline void* object() const { return (void*) (ptr & ~0xF); } + + size_t ptr; + }; + + struct EvalPatch + { + /* creates EvalPatch from a CatmullClarkPatch */ + template + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) + { + size_t ofs = 0, bytes = patch.bytes(); + void* ptr = alloc(bytes); + patch.serialize(ptr,ofs); + assert(ofs == bytes); + return Ref(EVAL_PATCH, ptr); + } + }; + + struct BilinearPatch + { + /* creates BilinearPatch from a CatmullClarkPatch */ + template + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch)); + } + + __forceinline BilinearPatch (const CatmullClarkPatch& patch) + : patch(patch) {} + + /* creates BilinearPatch from 4 vertices */ + template + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride)); + } + + __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + public: + BilinearPatchT patch; + }; + + struct BSplinePatch + { + /* creates BSplinePatch from a half edge */ + template + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride)); + } + + __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates BSplinePatch from a CatmullClarkPatch */ + template + __noinline static Ref create(const 
Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3)); + } + + __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BSplinePatchT patch; + }; + + struct BezierPatch + { + /* creates BezierPatch from a half edge */ + template + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride)); + } + + __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) + : patch(edge,vertices,stride) {} + + /* creates Bezier from a CatmullClarkPatch */ + template + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3)); + } + + __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + BezierPatchT patch; + }; + + struct GregoryPatch + { + /* creates GregoryPatch from half edge */ + template + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride)); + } + + __forceinline GregoryPatch (const HalfEdge* edge, const char* 
vertices, size_t stride) + : patch(CatmullClarkPatch(edge,vertices,stride)) {} + + /* creates GregoryPatch from CatmullClarkPatch */ + template + __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch, + const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { + return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3)); + } + + __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) + : patch(patch,border0,border1,border2,border3) {} + + public: + GregoryPatchT patch; + }; + + struct SubdividedQuadPatch + { + template + __noinline static Ref create(const Allocator& alloc, Ref children[4]) { + return Ref(SUBDIVIDED_QUAD_PATCH, new (alloc(sizeof(SubdividedQuadPatch))) SubdividedQuadPatch(children)); + } + + __forceinline SubdividedQuadPatch(Ref children[4]) { + for (size_t i=0; i<4; i++) child[i] = children[i]; + } + + public: + Ref child[4]; + }; + + struct SubdividedGeneralPatch + { + template + __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) { + return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N)); + } + + __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { + for (unsigned i=0; i + __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) + { + if (PATCH_MAX_CACHE_DEPTH == 0) + return nullptr; + + Ref child(0); + switch (edge->patch_type) { + case HalfEdge::BILINEAR_PATCH: child = BilinearPatch::create(alloc,edge,vertices,stride); break; + case HalfEdge::REGULAR_QUAD_PATCH: child = RegularPatch::create(alloc,edge,vertices,stride); break; +#if PATCH_USE_GREGORY == 2 + case HalfEdge::IRREGULAR_QUAD_PATCH: child = 
GregoryPatch::create(alloc,edge,vertices,stride); break; +#endif + default: { + GeneralCatmullClarkPatch patch(edge,vertices,stride); + child = PatchT::create(alloc,patch,edge,vertices,stride,0); + } + } + return child; + } + + template + __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth) + { + /* convert into standard quad patch if possible */ + if (likely(patch.isQuadPatch())) + { + CatmullClarkPatch qpatch; patch.init(qpatch); + return PatchT::create(alloc,qpatch,edge,vertices,stride,depth); + } + + /* do only cache up to some depth */ + if (depth >= PATCH_MAX_CACHE_DEPTH) + return nullptr; + + /* subdivide patch */ + unsigned N; + array_t patches; + patch.subdivide(patches,N); + + if (N == 4) + { + Ref child[4]; +#if PATCH_USE_GREGORY == 2 + BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders); + BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); + BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r); + BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r); + BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r); + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r); + child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr); + child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr); + child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l); +#else + GeneralCatmullClarkPatch::fix_quad_ring_order(patches); + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); +#endif + return SubdividedQuadPatch::create(alloc,child); + } + else + { + 
assert(N=max_eval_depth; +//#else + return depth>=max_eval_depth; +//#endif + } + + template + __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth, + const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr) + { + const typename CatmullClarkPatch::Type ty = patch.type(); + if (unlikely(final(patch,ty,depth))) { + if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + else return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); + } + else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { + assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); + } +#if PATCH_USE_GREGORY == 2 + else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { + assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); + } +#endif + else if (depth >= PATCH_MAX_CACHE_DEPTH) { + return EvalPatch::create(alloc,patch); + } + + else + { + Ref child[4]; + array_t patches; + patch.subdivide(patches); + + for (size_t i=0; i<4; i++) + child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); + return SubdividedQuadPatch::create(alloc,child); + } + } + }; + + typedef PatchT Patch3fa; +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h new file mode 100644 index 00000000000..482d015fa31 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h @@ -0,0 +1,129 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval.h" + +namespace embree +{ + namespace isa + { + template + struct PatchEval + { + public: + + typedef PatchT Patch; + typedef typename Patch::Ref 
Ref; + typedef CatmullClarkPatchT CatmullClarkPatch; + + PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, + Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv) + { + /* conservative time for the very first allocation */ + auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + },true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + if (patch && allAllocationsValid && eval(patch,u,v,1.0f,0)) { + SharedLazyTessellationCache::unlock(); + return; + } + SharedLazyTessellationCache::unlock(); + FeatureAdaptiveEval(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1); + } + + __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth) + { + if (v < 0.5f) { + if (u < 0.5f) return eval(This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + else return eval(This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + } else { + if (u > 0.5f) return eval(This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + else return eval(This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + } + } + + bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth) + { + const unsigned l = (unsigned) floor(0.5f*U); const float u = 
2.0f*frac(0.5f*U)-0.5f; + const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; + const unsigned i = 4*h+l; assert(iN); + return eval(This->child[i],u,v,1.0f,depth+1); + } + + bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) + { + if (!This) return false; + //PRINT(depth); + //PRINT2(u,v); + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + //PRINT("bilinear"); + ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,c); + return true; + } + case Patch::BSPLINE_PATCH: { + //PRINT("bspline"); + ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::BEZIER_PATCH: { + //PRINT("bezier"); + ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,c,-1); + return true; + } + case Patch::GREGORY_PATCH: { + //PRINT("gregory"); + ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); + PATCH_DEBUG_SUBDIVISION(This,-1,-1,c); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + //PRINT("subdivided quad"); + return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + //PRINT("general_patch"); + assert(dscale == 1.0f); + return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + //PRINT("eval_patch"); + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEval(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); + return true; + } + default: + assert(false); + return false; + } + } + + private: + Vertex* const P; + Vertex* const dPdu; + Vertex* const dPdv; + Vertex* 
const ddPdudu; + Vertex* const ddPdvdv; + Vertex* const ddPdudv; + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h new file mode 100644 index 00000000000..c05db55f4cd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h @@ -0,0 +1,245 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_grid.h" + +namespace embree +{ + namespace isa + { + struct PatchEvalGrid + { + typedef Patch3fa Patch; + typedef Patch::Ref Ref; + typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch; + typedef CatmullClarkPatch3fa CatmullClarkPatch; + typedef BSplinePatch3fa BSplinePatch; + typedef BezierPatch3fa BezierPatch; + typedef GregoryPatch3fa GregoryPatch; + typedef BilinearPatch3fa BilinearPatch; + + private: + const unsigned x0,x1; + const unsigned y0,y1; + const unsigned swidth,sheight; + const float rcp_swidth, rcp_sheight; + float* const Px; + float* const Py; + float* const Pz; + float* const U; + float* const V; + float* const Nx; + float* const Ny; + float* const Nz; + const unsigned dwidth,dheight; + unsigned count; + + public: + + PatchEvalGrid (Ref patch, unsigned subPatch, + const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, + float* Px, float* Py, float* Pz, float* U, float* V, + float* Nx, float* Ny, float* Nz, + const unsigned dwidth, const unsigned dheight) + : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), + Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0) + { + assert(swidth < (2<<20) && sheight < (2<<20)); + const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); + const BBox2f 
erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); + bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); + assert(done); + assert(count == (x1-x0+1)*(y1-y0+1)); + } + + template + __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) + { + const float scale_x = rcp(srange.upper.x-srange.lower.x); + const float scale_y = rcp(srange.upper.y-srange.lower.y); + count += (lx1-lx0)*(ly1-ly0); + +#if 0 + for (unsigned iy=ly0; iypatch.eval(lu,lv); + const float u = float(ix)*rcp_swidth; + const float v = float(iy)*rcp_sheight; + const int ofs = (iy-y0)*dwidth+(ix-x0); + Px[ofs] = p.x; + Py[ofs] = p.y; + Pz[ofs] = p.z; + U[ofs] = u; + V[ofs] = v; + } + } +#else + foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const Vec3vfx p = patch->patch.eval(lu,lv); + Vec3vfx n = zero; + if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); + const vfloatx u = vfloatx(ix)*rcp_swidth; + const vfloatx v = vfloatx(iy)*rcp_sheight; + const vintx ofs = (iy-y0)*dwidth+(ix-x0); + if (likely(all(valid)) && all(iy==iy[0])) { + const unsigned ofs2 = ofs[0]; + vfloatx::storeu(Px+ofs2,p.x); + vfloatx::storeu(Py+ofs2,p.y); + vfloatx::storeu(Pz+ofs2,p.z); + vfloatx::storeu(U+ofs2,u); + vfloatx::storeu(V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(Nx+ofs2,n.x); + vfloatx::storeu(Ny+ofs2,n.y); + vfloatx::storeu(Nz+ofs2,n.z); + } + } else { + foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) { + const unsigned ofs2 = ofs[j]-j; + vfloatx::storeu(valid,Px+ofs2,p.x); + vfloatx::storeu(valid,Py+ofs2,p.y); + vfloatx::storeu(valid,Pz+ofs2,p.z); + vfloatx::storeu(valid,U+ofs2,u); + 
vfloatx::storeu(valid,V+ofs2,v); + if (unlikely(Nx != nullptr)) { + vfloatx::storeu(valid,Nx+ofs2,n.x); + vfloatx::storeu(valid,Ny+ofs2,n.y); + vfloatx::storeu(valid,Nz+ofs2,n.z); + } + }); + } + }); +#endif + } + + bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) + { + if (erange.empty()) + return true; + + const int lx0 = (int) ceilf(erange.lower.x); + const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); + const int ly0 = (int) ceilf(erange.lower.y); + const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); + if (lx0 >= lx1 || ly0 >= ly1) + return true; + + if (!This) + return false; + + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BSPLINE_PATCH: { + evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::BEZIER_PATCH: { + evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::GREGORY_PATCH: { + evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1); + return true; + } + case Patch::SUBDIVIDED_QUAD_PATCH: + { + const Vec2f c = srange.center(); + const BBox2f srange0(srange.lower,c); + const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); + const BBox2f srange2(c,srange.upper); + const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); + + Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object(); + eval(patch->child[0],srange0,intersect(srange0,erange),depth+1); + eval(patch->child[1],srange1,intersect(srange1,erange),depth+1); + eval(patch->child[2],srange2,intersect(srange2,erange),depth+1); + eval(patch->child[3],srange3,intersect(srange3,erange),depth+1); + return true; + } 
+ case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight); + count += (lx1-lx0)*(ly1-ly0); + return true; + } + default: + assert(false); + return false; + } + } + + bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) + { + if (!This) + return false; + + switch (This.type()) + { + case Patch::SUBDIVIDED_GENERAL_PATCH: { + Patch::SubdividedGeneralPatch* patch = (Patch::SubdividedGeneralPatch*)This.object(); + assert(subPatch < patch->N); + return eval(patch->child[subPatch],srange,erange,1); + } + default: + assert(subPatch == 0); + return eval(This,srange,erange,0); + } + } + }; + + __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h) + { + const unsigned N = h->numEdges(); + if (N == 4) return 1; + else return N; + } + + template + inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator) + { + const unsigned N = h->numEdges(); + int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t + float levels[GeneralCatmullClarkPatch3fa::SIZE]; + for (unsigned i=0; ihasOpposite() ? 
h->opposite()->numEdges() != 4 : 0; + levels[i] = h->edge_level; + h = h->next(); + } + if (N == 4) + { + const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) }; + tessellator(uv,neighborSubdiv,levels,0); + } + else + { + for (unsigned i=0; i 16"); + const int h = (i >> 2) & 3, l = i & 3; + const Vec2f subPatchID((float)l,(float)h); + const Vec2f uv[4] = { 2.0f*subPatchID + (0.5f+Vec2f(0.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,0.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(1.0f,1.0f)), + 2.0f*subPatchID + (0.5f+Vec2f(0.0f,1.0f)) }; + const int neighborSubdiv1[4] = { 0,0,0,0 }; + const float levels1[4] = { 0.5f*levels[(i+0)%N], 0.5f*levels[(i+0)%N], 0.5f*levels[(i+N-1)%N], 0.5f*levels[(i+N-1)%N] }; + tessellator(uv,neighborSubdiv1,levels1,i); + } + } + } + } +} + diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h new file mode 100644 index 00000000000..28016d9e20d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h @@ -0,0 +1,127 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "patch.h" +#include "feature_adaptive_eval_simd.h" + +namespace embree +{ + namespace isa + { + template + struct PatchEvalSimd + { + public: + + typedef PatchT Patch; + typedef typename Patch::Ref Ref; + typedef CatmullClarkPatchT CatmullClarkPatch; + + PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, + const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, + float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N) + : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N) + { + /* conservative time for the very first allocation */ + auto time = 
SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + + Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { + auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); }; + return Patch::create(alloc,edge,vertices,stride); + }, true); + + auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter); + const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); + + patch = allAllocationsValid ? patch : nullptr; + + /* use cached data structure for calculations */ + const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false); + SharedLazyTessellationCache::unlock(); + const vbool valid2 = valid0 & !valid1; + if (any(valid2)) { + FeatureAdaptiveEvalSimd(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + } + } + + vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + vbool ret = false; + const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; + const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; + const vbool u0v0_mask = valid & u0_mask & v0_mask; + const vbool u0v1_mask = valid & u0_mask & v1_mask; + const vbool u1v0_mask = valid & u1_mask & v0_mask; + const vbool u1v1_mask = valid & u1_mask & v1_mask; + if (any(u0v0_mask)) ret |= eval(u0v0_mask,This->child[0],2.0f*u,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v0_mask)) ret |= eval(u1v0_mask,This->child[1],2.0f*u-1.0f,2.0f*v,2.0f*dscale,depth+1); + if (any(u1v1_mask)) ret |= eval(u1v1_mask,This->child[2],2.0f*u-1.0f,2.0f*v-1.0f,2.0f*dscale,depth+1); + if (any(u0v1_mask)) ret |= eval(u0v1_mask,This->child[3],2.0f*u,2.0f*v-1.0f,2.0f*dscale,depth+1); + return ret; + } + + vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) + { + vbool ret = 
false; + const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; + const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; + const vint i = (h<<2)+l; assert(all(valid,iN)); + foreach_unique(valid,i,[&](const vbool& valid, const int i) { + ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1); + }); + return ret; + } + + vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) + { + if (!This) return false; + switch (This.type()) + { + case Patch::BILINEAR_PATCH: { + ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BSPLINE_PATCH: { + ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::BEZIER_PATCH: { + ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::GREGORY_PATCH: { + ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); + return valid; + } + case Patch::SUBDIVIDED_QUAD_PATCH: { + return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth); + } + case Patch::SUBDIVIDED_GENERAL_PATCH: { + assert(dscale == 1.0f); + return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); + } + case Patch::EVAL_PATCH: { + CatmullClarkPatch patch; patch.deserialize(This.object()); + FeatureAdaptiveEvalSimd(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N); + return valid; + } + default: + assert(false); + return false; + } + } + + private: + float* const P; + float* const dPdu; + float* const dPdv; + float* const ddPdudu; + float* const ddPdvdv; + float* const ddPdudv; + const size_t dstride; + const 
size_t N; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h new file mode 100644 index 00000000000..d5bc403cca8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h @@ -0,0 +1,156 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../geometry/primitive.h" +#include "bspline_patch.h" +#include "bezier_patch.h" +#include "gregory_patch.h" +#include "gregory_patch_dense.h" +#include "tessellation.h" +#include "tessellation_cache.h" +#include "gridrange.h" +#include "patch_eval_grid.h" +#include "feature_adaptive_eval_grid.h" +#include "../common/scene_subdiv_mesh.h" + +namespace embree +{ + struct __aligned(64) SubdivPatch1Base + { + public: + + enum Type { + INVALID_PATCH = 0, + BSPLINE_PATCH = 1, + BEZIER_PATCH = 2, + GREGORY_PATCH = 3, + EVAL_PATCH = 5, + BILINEAR_PATCH = 6, + }; + + enum Flags { + TRANSITION_PATCH = 16, + }; + + /*! Default constructor. 
*/ + __forceinline SubdivPatch1Base () {} + + SubdivPatch1Base (const unsigned int gID, + const unsigned int pID, + const unsigned int subPatch, + const SubdivMesh *const mesh, + const size_t time, + const Vec2f uv[4], + const float edge_level[4], + const int subdiv[4], + const int simd_width); + + __forceinline bool needsStitching() const { + return flags & TRANSITION_PATCH; + } + + __forceinline Vec2f getUV(const size_t i) const { + return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000); + } + + static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]); + static Vec2i computeGridSize(const float level[4]); + bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width); + + public: + + __forceinline size_t getGridBytes() const { + const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4; + return 64*((grid_size_xyzuv+15) / 16); + } + + __forceinline void write_lock() { mtx.lock(); } + __forceinline void write_unlock() { mtx.unlock(); } + __forceinline bool try_write_lock() { return mtx.try_lock(); } + //__forceinline bool try_read_lock() { return mtx.try_read_lock(); } + + __forceinline void resetRootRef() { + //assert( mtx.hasInitialState() ); + root_ref = SharedLazyTessellationCache::Tag(); + } + + __forceinline SharedLazyTessellationCache::CacheEntry& entry() { + return (SharedLazyTessellationCache::CacheEntry&) root_ref; + } + + public: + __forceinline unsigned int geomID() const { + return geom; + } + + __forceinline unsigned int primID() const { + return prim; + } + + public: + SharedLazyTessellationCache::Tag root_ref; + SpinLock mtx; + + unsigned short u[4]; //!< 16bit discretized u,v coordinates + unsigned short v[4]; + float level[4]; + + unsigned char flags; + unsigned char type; + unsigned short grid_u_res; + unsigned int geom; //!< geometry ID of the subdivision mesh this patch belongs to + unsigned int prim; //!< primitive ID of this 
subdivision patch + unsigned short grid_v_res; + + unsigned short grid_size_simd_blocks; + unsigned int time_; + + struct PatchHalfEdge { + const HalfEdge* edge; + unsigned subPatch; + }; + + Vec3fa patch_v[4][4]; + + const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; } + unsigned time() const { return time_; } + unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; } + + void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; } + void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; } + }; + + namespace isa + { + Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv); + Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv); + + template + Vec3 patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + template + Vec3 patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); + + + /* eval grid over patch and stich edges when required */ + void evalGrid(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + float *__restrict__ const grid_x, + float *__restrict__ const grid_y, + float *__restrict__ const grid_z, + float *__restrict__ const grid_u, + float *__restrict__ const grid_v, + const SubdivMesh* const geom); + + /* eval grid over patch and stich edges when required */ + BBox3fa evalGridBounds(const SubdivPatch1Base& patch, + const unsigned x0, const unsigned x1, + const unsigned y0, const unsigned y1, + const unsigned swidth, const unsigned sheight, + const SubdivMesh* const geom); + } +} diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h new file mode 100644 index 00000000000..bda1e2d5591 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h @@ -0,0 +1,161 @@ +// Copyright 
2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +namespace embree +{ + /* adjust discret tessellation level for feature-adaptive pre-subdivision */ + __forceinline float adjustTessellationLevel(float l, const size_t sublevel) + { + for (size_t i=0; i= 2); + + const float inv_low_rate = rcp((float)(low_rate-1)); + const unsigned int dy = low_rate - 1; + const unsigned int dx = high_rate - 1; + + int p = 2*dy-dx; + + unsigned int offset = 0; + unsigned int y = 0; + float value = 0.0f; + for(unsigned int x=0;x data; + }; + + static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; } + + struct CacheEntry + { + Tag tag; + SpinLock mutex; + }; + + private: + + float *data; + bool hugepages; + size_t size; + size_t maxBlocks; + ThreadWorkState *threadWorkState; + + __aligned(64) std::atomic localTime; + __aligned(64) std::atomic next_block; + __aligned(64) SpinLock reset_state; + __aligned(64) SpinLock linkedlist_mtx; + __aligned(64) std::atomic switch_block_threshold; + __aligned(64) std::atomic numRenderThreads; + + + public: + + + SharedLazyTessellationCache(); + ~SharedLazyTessellationCache(); + + void getNextRenderThreadWorkState(); + + __forceinline size_t maxAllocSize() const { + return switch_block_threshold; + } + + __forceinline size_t getCurrentIndex() { return localTime.load(); } + __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); } + + __forceinline size_t getTime(const size_t globalTime) { + return localTime.load()+NUM_CACHE_SEGMENTS*globalTime; + } + + + __forceinline size_t lockThread (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); } + __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); } + + __forceinline bool isLocked(ThreadWorkState *const t_state) { return 
t_state->counter.load() != 0; } + + static __forceinline void lock () { sharedLazyTessellationCache.lockThread(threadState()); } + static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); } + static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); } + static __forceinline size_t getState() { return threadState()->counter.load(); } + static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); } + + static __forceinline size_t getTCacheTime(const size_t globalTime) { + return sharedLazyTessellationCache.getTime(globalTime); + } + + /* per thread lock */ + __forceinline void lockThreadLoop (ThreadWorkState *const t_state) + { + while(1) + { + size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1); + if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD)) + { + /* lock failed wait until sync phase is over */ + sharedLazyTessellationCache.unlockThread(t_state,-1); + sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0); + } + else + break; + } + } + + static __forceinline void* lookup(CacheEntry& entry, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = entry.tag.get(); + CACHE_STATS(SharedTessellationCacheStats::cache_accesses++); + + if (likely(subdiv_patch_root_ref != 0)) + { + const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr(); + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + + if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) )) + { + CACHE_STATS(SharedTessellationCacheStats::cache_hits++); + return (void*) subdiv_patch_root; + } + } + CACHE_STATS(SharedTessellationCacheStats::cache_misses++); + return nullptr; + } + + template + static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool 
before=false) -> decltype(constructor()) + { + ThreadWorkState *t_state = SharedLazyTessellationCache::threadState(); + + while (true) + { + sharedLazyTessellationCache.lockThreadLoop(t_state); + void* patch = SharedLazyTessellationCache::lookup(entry,globalTime); + if (patch) return (decltype(constructor())) patch; + + if (entry.mutex.try_lock()) + { + if (!validTag(entry.tag,globalTime)) + { + auto timeBefore = sharedLazyTessellationCache.getTime(globalTime); + auto ret = constructor(); // thread is locked here! + assert(ret); + /* this should never return nullptr */ + auto timeAfter = sharedLazyTessellationCache.getTime(globalTime); + auto time = before ? timeBefore : timeAfter; + __memory_barrier(); + entry.tag = SharedLazyTessellationCache::Tag(ret,time); + __memory_barrier(); + entry.mutex.unlock(); + return ret; + } + entry.mutex.unlock(); + } + SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); + } + } + + __forceinline bool validCacheIndex(const size_t i, const size_t globalTime) + { +#if FORCE_SIMPLE_FLUSH == 1 + return i == getTime(globalTime); +#else + return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime); +#endif + } + + static __forceinline bool validTime(const size_t oldtime, const size_t newTime) + { + return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime; + } + + + static __forceinline bool validTag(const Tag& tag, size_t globalTime) + { + const int64_t subdiv_patch_root_ref = tag.get(); + if (subdiv_patch_root_ref == 0) return false; + const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref); + return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime); + } + + void waitForUsersLessEqual(ThreadWorkState *const t_state, + const unsigned int users); + + __forceinline size_t alloc(const size_t blocks) + { + if (unlikely(blocks >= switch_block_threshold)) + throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment"); + + assert(blocks 
< switch_block_threshold); + size_t index = next_block.fetch_add(blocks); + if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1; + return index; + } + + static __forceinline void* malloc(const size_t bytes) + { + size_t block_index = -1; + ThreadWorkState *const t_state = threadState(); + while (true) + { + block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE); + if (block_index == (size_t)-1) + { + sharedLazyTessellationCache.unlockThread(t_state); + sharedLazyTessellationCache.allocNextSegment(); + sharedLazyTessellationCache.lockThread(t_state); + continue; + } + break; + } + return sharedLazyTessellationCache.getBlockPtr(block_index); + } + + __forceinline void *getBlockPtr(const size_t block_index) + { + assert(block_index < maxBlocks); + assert(data); + assert(block_index*16 <= size); + return (void*)&data[block_index*16]; + } + + __forceinline void* getDataPtr() { return data; } + __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; } + __forceinline size_t getMaxBlocks() { return maxBlocks; } + __forceinline size_t getSize() { return size; } + + void allocNextSegment(); + void realloc(const size_t newSize); + + void reset(); + + static SharedLazyTessellationCache sharedLazyTessellationCache; + }; +} diff --git a/thirdparty/embree-aarch64/patches/godot-changes.patch b/thirdparty/embree-aarch64/patches/godot-changes.patch new file mode 100644 index 00000000000..86fbf226d28 --- /dev/null +++ b/thirdparty/embree-aarch64/patches/godot-changes.patch @@ -0,0 +1,630 @@ +diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h +index 76c6b740aa..51d296fb16 100644 +--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h ++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h +@@ -27,7 +27,10 @@ namespace embree + func(r.begin()); + }); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task 
cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + } + #elif defined(TASKING_GCD) && defined(BUILD_IOS) + +@@ -55,13 +58,19 @@ namespace embree + func(i); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -81,7 +90,10 @@ namespace embree + #if defined(TASKING_INTERNAL) + TaskScheduler::spawn(first,last,minStepSize,func); + if (!TaskScheduler::wait()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + + #elif defined(TASKING_GCD) && defined(BUILD_IOS) + +@@ -109,13 +121,19 @@ namespace embree + func(range(r.begin(),r.end())); + },context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(tbb::blocked_range(first,last,minStepSize),[&](const tbb::blocked_range& r) { + func(range(r.begin(),r.end())); + }); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + + #elif defined(TASKING_PPL) +@@ -147,13 +165,19 @@ namespace embree + func(i); + },tbb::simple_partitioner(),context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ 
// throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },tbb::simple_partitioner()); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +@@ -168,13 +192,19 @@ namespace embree + func(i); + },ap,context); + if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #else + tbb::parallel_for(Index(0),N,Index(1),[&](Index i) { + func(i); + },ap); + if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task cancelled"); ++ abort(); ++ // -- GODOT end -- + #endif + } + +diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h +index d444b6a2e4..0daf94e50e 100644 +--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h ++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h +@@ -58,15 +58,19 @@ namespace embree + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return reduction(start,func(range(r.begin(),r.end()))); }, + reduction,context); +- if (context.is_group_execution_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (context.is_group_execution_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #else + const Value v = tbb::parallel_reduce(tbb::blocked_range(first,last,minStepSize),identity, + [&](const tbb::blocked_range& r, const Value& start) { return 
reduction(start,func(range(r.begin(),r.end()))); }, + reduction); +- if (tbb::task::self().is_cancelled()) +- throw std::runtime_error("task cancelled"); ++ // -- GODOT start -- ++ // if (tbb::task::self().is_cancelled()) ++ // throw std::runtime_error("task cancelled"); ++ // -- GODOT end -- + return v; + #endif + #else // TASKING_PPL +diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp +index 7e7b9faef8..98dc80ad59 100644 +--- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp ++++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp +@@ -39,7 +39,10 @@ namespace embree + std::vector str; str.reserve(64); + while (cin->peek() != EOF && !isSeparator(cin->peek())) { + int c = cin->get(); +- if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ // -- GODOT start -- ++ // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input"); ++ if (!isValidChar(c)) abort(); ++ // -- GODOT end -- + str.push_back((char)c); + } + str.push_back(0); +diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp +index 4e8928242e..12f143f131 100644 +--- a/thirdparty/embree-aarch64/common/sys/alloc.cpp ++++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp +@@ -21,7 +21,10 @@ namespace embree + void* ptr = _mm_malloc(size,align); + + if (size != 0 && ptr == nullptr) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return ptr; + } +@@ -128,7 +131,10 @@ namespace embree + /* fall back to 4k pages */ + int flags = MEM_COMMIT | MEM_RESERVE; + char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE); +- if (ptr == nullptr) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == nullptr) throw std::bad_alloc(); ++ if (ptr == nullptr) abort(); ++ // -- GODOT end -- + hugepages = 
false; + return ptr; + } +@@ -145,7 +151,10 @@ namespace embree + return bytesOld; + + if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -156,7 +165,10 @@ namespace embree + return; + + if (!VirtualFree(ptr,0,MEM_RELEASE)) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + void os_advise(void *ptr, size_t bytes) +@@ -260,7 +272,10 @@ namespace embree + + /* fallback to 4k pages */ + void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +- if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ // -- GODOT start -- ++ // if (ptr == MAP_FAILED) throw std::bad_alloc(); ++ if (ptr == MAP_FAILED) abort(); ++ // -- GODOT end -- + hugepages = false; + + /* advise huge page hint for THP */ +@@ -277,7 +292,10 @@ namespace embree + return bytesOld; + + if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + + return bytesNew; + } +@@ -291,7 +309,10 @@ namespace embree + const size_t pageSize = hugepages ? 
PAGE_SIZE_2M : PAGE_SIZE_4K; + bytes = (bytes+pageSize-1) & ~(pageSize-1); + if (munmap(ptr,bytes) == -1) +- throw std::bad_alloc(); ++ // -- GODOT start -- ++ // throw std::bad_alloc(); ++ abort(); ++ // -- GODOT end -- + } + + /* hint for transparent huge pages (THP) */ +diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h +index 7914eb7a52..737f14aa6e 100644 +--- a/thirdparty/embree-aarch64/common/sys/platform.h ++++ b/thirdparty/embree-aarch64/common/sys/platform.h +@@ -174,11 +174,19 @@ + #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT start -- ++ // #define THROW_RUNTIME_ERROR(str) ++ // throw std::runtime_error(str); + #define THROW_RUNTIME_ERROR(str) \ +- throw std::runtime_error(str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define FATAL(x) THROW_RUNTIME_ERROR(x) +diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp +index 98d7fb9249..ebf656d1a0 100644 +--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp ++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp +@@ -48,13 +48,15 @@ namespace embree + { + Task* prevTask = thread.task; + thread.task = this; +- try { +- if 
(thread.scheduler->cancellingException == nullptr) ++ // -- GODOT start -- ++ // try { ++ // if (thread.scheduler->cancellingException == nullptr) + closure->execute(); +- } catch (...) { +- if (thread.scheduler->cancellingException == nullptr) +- thread.scheduler->cancellingException = std::current_exception(); +- } ++ // } catch (...) { ++ // if (thread.scheduler->cancellingException == nullptr) ++ // thread.scheduler->cancellingException = std::current_exception(); ++ // } ++ // -- GODOT end -- + thread.task = prevTask; + add_dependencies(-1); + } +@@ -297,8 +299,11 @@ namespace embree + size_t threadIndex = allocThreadIndex(); + condition.wait(mutex, [&] () { return hasRootTask.load(); }); + mutex.unlock(); +- std::exception_ptr except = thread_loop(threadIndex); +- if (except != nullptr) std::rethrow_exception(except); ++ // -- GODOT start -- ++ // std::exception_ptr except = thread_loop(threadIndex); ++ // if (except != nullptr) std::rethrow_exception(except); ++ thread_loop(threadIndex); ++ // -- GODOT end -- + } + + void TaskScheduler::reset() { +@@ -330,7 +335,10 @@ namespace embree + return thread->scheduler->cancellingException == nullptr; + } + +- std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT start -- ++// std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex) ++ void TaskScheduler::thread_loop(size_t threadIndex) ++// -- GODOT end -- + { + /* allocate thread structure */ + std::unique_ptr mthread(new Thread(threadIndex,this)); // too large for stack allocation +@@ -353,9 +361,10 @@ namespace embree + swapThread(oldThread); + + /* remember exception to throw */ +- std::exception_ptr except = nullptr; +- if (cancellingException != nullptr) except = cancellingException; +- ++ // -- GODOT start -- ++ // std::exception_ptr except = nullptr; ++ // if (cancellingException != nullptr) except = cancellingException; ++ // -- GODOT end -- + /* wait for all threads to terminate */ + threadCounter--; + #if 
defined(__WIN32__) +@@ -373,7 +382,10 @@ namespace embree + yield(); + #endif + } +- return except; ++ // -- GODOT start -- ++ // return except; ++ return; ++ // -- GODOT end -- + } + + bool TaskScheduler::steal_from_other_threads(Thread& thread) +diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h +index c2a9391aea..8bd70b2b8c 100644 +--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h ++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h +@@ -123,7 +123,10 @@ namespace embree + { + size_t ofs = bytes + ((align - stackPtr) & (align-1)); + if (stackPtr + ofs > CLOSURE_STACK_SIZE) +- throw std::runtime_error("closure stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("closure stack overflow"); ++ abort(); ++ // -- GODOT end -- + stackPtr += ofs; + return &stack[stackPtr-bytes]; + } +@@ -132,7 +135,10 @@ namespace embree + __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure) + { + if (right >= TASK_STACK_SIZE) +- throw std::runtime_error("task stack overflow"); ++ // -- GODOT start -- ++ // throw std::runtime_error("task stack overflow"); ++ abort(); ++ // -- GODOT end -- + + /* allocate new task on right side of stack */ + size_t oldStackPtr = stackPtr; +@@ -239,7 +245,10 @@ namespace embree + void wait_for_threads(size_t threadCount); + + /*! thread loop for all worker threads */ +- std::exception_ptr thread_loop(size_t threadIndex); ++ // -- GODOT start -- ++ // std::exception_ptr thread_loop(size_t threadIndex); ++ void thread_loop(size_t threadIndex); ++ // -- GODOT end -- + + /*! 
steals a task from a different thread */ + bool steal_from_other_threads(Thread& thread); +diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp +index 20cdd2d320..aa56035026 100644 +--- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp ++++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp +@@ -150,7 +150,10 @@ namespace embree + } + } + else { +- throw std::runtime_error("not supported node type in bvh_statistics"); ++ // -- GODOT start -- ++ // throw std::runtime_error("not supported node type in bvh_statistics"); ++ abort(); ++ // -- GODOT end -- + } + return s; + } +diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp +index ee5c37b238..625fbf6d4f 100644 +--- a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp ++++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp +@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN; + if (quality != RTC_BUILD_QUALITY_LOW && + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + scene->setBuildQuality(quality); + RTC_CATCH_END2(scene); + } +@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN; + quality != RTC_BUILD_QUALITY_MEDIUM && + quality != RTC_BUILD_QUALITY_HIGH && + quality != RTC_BUILD_QUALITY_REFIT) +- throw std::runtime_error("invalid build quality"); ++ // -- GODOT start -- ++ // throw std::runtime_error("invalid build quality"); ++ abort(); ++ // -- GODOT end -- + geometry->setBuildQuality(quality); + RTC_CATCH_END2(geometry); + } +diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h +index 6583d12d57..4b070e122b 100644 +--- a/thirdparty/embree-aarch64/kernels/common/rtcore.h ++++ 
b/thirdparty/embree-aarch64/kernels/common/rtcore.h +@@ -25,52 +25,58 @@ namespace embree + #endif + + /*! Macros used in the rtcore API implementation */ +-#define RTC_CATCH_BEGIN try { ++// -- GODOT start -- ++// #define RTC_CATCH_BEGIN try { ++#define RTC_CATCH_BEGIN + +-#define RTC_CATCH_END(device) \ +- } catch (std::bad_alloc&) { \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END(device) \ ++// } catch (std::bad_alloc&) { \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END(device) + +-#define RTC_CATCH_END2(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- } ++// #define RTC_CATCH_END2(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? 
scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// } catch (...) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// } ++#define RTC_CATCH_END2(scene) + +-#define RTC_CATCH_END2_FALSE(scene) \ +- } catch (std::bad_alloc&) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ +- return false; \ +- } catch (rtcore_error& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,e.error,e.what()); \ +- return false; \ +- } catch (std::exception& e) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ +- return false; \ +- } catch (...) { \ +- Device* device = scene ? scene->device : nullptr; \ +- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ +- return false; \ +- } ++// #define RTC_CATCH_END2_FALSE(scene) \ ++// } catch (std::bad_alloc&) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ ++// return false; \ ++// } catch (rtcore_error& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,e.error,e.what()); \ ++// return false; \ ++// } catch (std::exception& e) { \ ++// Device* device = scene ? scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ ++// return false; \ ++// } catch (...) { \ ++// Device* device = scene ? 
scene->device : nullptr; \ ++// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ ++// return false; \ ++// } ++#define RTC_CATCH_END2_FALSE(scene) return false; ++// -- GODOT end -- + + #define RTC_VERIFY_HANDLE(handle) \ + if (handle == nullptr) { \ +@@ -97,28 +103,38 @@ namespace embree + #define RTC_TRACE(x) + #endif + +- /*! used to throw embree API errors */ +- struct rtcore_error : public std::exception +- { +- __forceinline rtcore_error(RTCError error, const std::string& str) +- : error(error), str(str) {} +- +- ~rtcore_error() throw() {} +- +- const char* what () const throw () { +- return str.c_str(); +- } +- +- RTCError error; +- std::string str; +- }; ++// -- GODOT begin -- ++// /*! used to throw embree API errors */ ++// struct rtcore_error : public std::exception ++// { ++// __forceinline rtcore_error(RTCError error, const std::string& str) ++// : error(error), str(str) {} ++// ++// ~rtcore_error() throw() {} ++// ++// const char* what () const throw () { ++// return str.c_str(); ++// } ++// ++// RTCError error; ++// std::string str; ++// }; ++// -- GODOT end -- + + #if defined(DEBUG) // only report file and line in debug mode ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); ++ printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort(); ++ // -- GODOT end -- + #else ++ // -- GODOT begin -- ++ // #define throw_RTCError(error,str) \ ++ // throw rtcore_error(error,str); + #define throw_RTCError(error,str) \ +- throw rtcore_error(error,str); ++ abort(); ++ // -- GODOT end -- + #endif + + #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ +diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp 
b/thirdparty/embree-aarch64/kernels/common/scene.cpp +index e75aa968f9..1e23aeb415 100644 +--- a/thirdparty/embree-aarch64/kernels/common/scene.cpp ++++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp +@@ -800,16 +800,18 @@ namespace embree + } + + /* initiate build */ +- try { ++ // -- GODOT start -- ++ // try { + scheduler->spawn_root([&]() { commit_task(); Lock lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join); +- } +- catch (...) { +- accels_clear(); +- updateInterface(); +- Lock lock(schedulerMutex); +- this->scheduler = nullptr; +- throw; +- } ++ // } ++ // catch (...) { ++ // accels_clear(); ++ // updateInterface(); ++ // Lock lock(schedulerMutex); ++ // this->scheduler = nullptr; ++ // throw; ++ // } ++ // -- GODOT end -- + } + + #endif