From 057367bf4f23850eb455585c3845b0bebee2aa03 Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 22 Sep 2023 18:38:02 -0300 Subject: [PATCH] Add FidelityFX Super Resolution 2.2 (FSR 2.2.1) support. Introduces support for FSR2 as a new upscaler option available from the project settings. Also introduces a specific render list for surfaces that require motion and the ability to derive motion vectors from depth buffer and camera motion. --- COPYRIGHT.txt | 5 + doc/classes/RenderSceneBuffersRD.xml | 13 + doc/classes/RenderingServer.xml | 8 +- doc/classes/Viewport.xml | 8 +- drivers/vulkan/rendering_device_vulkan.cpp | 12 +- drivers/vulkan/vulkan_context.cpp | 23 + drivers/vulkan/vulkan_context.h | 3 + editor/plugins/node_3d_editor_plugin.cpp | 6 +- editor/plugins/node_3d_editor_plugin.h | 1 + scene/main/viewport.cpp | 4 +- scene/main/viewport.h | 2 + servers/rendering/renderer_rd/effects/SCsub | 31 +- .../renderer_rd/effects/copy_effects.cpp | 4 +- .../renderer_rd/effects/debug_effects.cpp | 23 +- .../renderer_rd/effects/debug_effects.h | 8 +- .../rendering/renderer_rd/effects/fsr2.cpp | 889 +++++ servers/rendering/renderer_rd/effects/fsr2.h | 199 ++ servers/rendering/renderer_rd/effects/taa.cpp | 14 - servers/rendering/renderer_rd/effects/taa.h | 1 - .../rendering/renderer_rd/environment/sky.cpp | 11 +- .../rendering/renderer_rd/environment/sky.h | 2 +- .../render_forward_clustered.cpp | 226 +- .../render_forward_clustered.h | 14 +- .../forward_mobile/render_forward_mobile.cpp | 6 +- .../renderer_rd/renderer_scene_render_rd.cpp | 37 +- .../renderer_rd/shaders/effects/SCsub | 2 + .../renderer_rd/shaders/effects/fsr2/SCsub | 17 + .../effects/fsr2/fsr2_accumulate_pass.glsl | 8 + .../fsr2/fsr2_autogen_reactive_pass.glsl | 8 + .../fsr2_compute_luminance_pyramid_pass.glsl | 7 + .../effects/fsr2/fsr2_depth_clip_pass.glsl | 8 + .../shaders/effects/fsr2/fsr2_lock_pass.glsl | 7 + .../shaders/effects/fsr2/fsr2_rcas_pass.glsl | 7 + .../fsr2_reconstruct_previous_depth_pass.glsl | 8 + .../effects/fsr2/fsr2_tcr_autogen_pass.glsl | 8 + .../shaders/effects/motion_vector_inc.glsl | 6 + .../shaders/effects/motion_vectors.glsl | 19 +- .../scene_forward_clustered.glsl | 2 + .../renderer_rd/shaders/scene_data_inc.glsl | 2 +- .../renderer_rd/storage_rd/mesh_storage.cpp | 12 +- .../renderer_rd/storage_rd/mesh_storage.h | 3 + .../storage_rd/render_scene_buffers_rd.cpp | 60 +- .../storage_rd/render_scene_buffers_rd.h | 36 +- .../storage_rd/render_scene_data_rd.cpp | 3 +- .../storage_rd/render_scene_data_rd.h | 4 +- servers/rendering/renderer_scene_cull.cpp | 49 +- servers/rendering/renderer_scene_cull.h | 5 +- servers/rendering/renderer_viewport.cpp | 59 +- servers/rendering/renderer_viewport.h | 2 + servers/rendering/rendering_device.h | 18 + servers/rendering/rendering_method.h | 2 +- .../storage/render_scene_buffers.cpp | 2 +- servers/rendering_server.cpp | 4 +- servers/rendering_server.h | 2 + thirdparty/README.md | 15 + thirdparty/amd-fsr2/LICENSE.txt | 21 + thirdparty/amd-fsr2/ffx_assert.cpp | 81 + thirdparty/amd-fsr2/ffx_assert.h | 132 + thirdparty/amd-fsr2/ffx_error.h | 59 + thirdparty/amd-fsr2/ffx_fsr2.cpp | 1373 ++++++++ thirdparty/amd-fsr2/ffx_fsr2.h | 458 +++ thirdparty/amd-fsr2/ffx_fsr2_interface.h | 395 +++ thirdparty/amd-fsr2/ffx_fsr2_maximum_bias.h | 46 + thirdparty/amd-fsr2/ffx_fsr2_private.h | 86 + thirdparty/amd-fsr2/ffx_types.h | 367 ++ thirdparty/amd-fsr2/ffx_util.h | 78 + .../amd-fsr2/patches/godot-changes.patch | Bin 0 -> 21350 bytes .../amd-fsr2/shaders/ffx_common_types.h | 429 +++
thirdparty/amd-fsr2/shaders/ffx_core.h | 52 + thirdparty/amd-fsr2/shaders/ffx_core_cpu.h | 332 ++ thirdparty/amd-fsr2/shaders/ffx_core_glsl.h | 1669 +++++++++ .../amd-fsr2/shaders/ffx_core_gpu_common.h | 2784 +++++++++++++++ .../shaders/ffx_core_gpu_common_half.h | 2978 +++++++++++++++++ thirdparty/amd-fsr2/shaders/ffx_core_hlsl.h | 1502 +++++++++ .../amd-fsr2/shaders/ffx_core_portability.h | 50 + thirdparty/amd-fsr2/shaders/ffx_fsr1.h | 1250 +++++++ .../amd-fsr2/shaders/ffx_fsr2_accumulate.h | 295 ++ .../shaders/ffx_fsr2_accumulate_pass.glsl | 92 + .../ffx_fsr2_autogen_reactive_pass.glsl | 93 + .../shaders/ffx_fsr2_callbacks_glsl.h | 704 ++++ .../shaders/ffx_fsr2_callbacks_hlsl.h | 799 +++++ thirdparty/amd-fsr2/shaders/ffx_fsr2_common.h | 565 ++++ .../ffx_fsr2_compute_luminance_pyramid.h | 189 ++ ...x_fsr2_compute_luminance_pyramid_pass.glsl | 134 + .../amd-fsr2/shaders/ffx_fsr2_depth_clip.h | 258 ++ .../shaders/ffx_fsr2_depth_clip_pass.glsl | 67 + .../amd-fsr2/shaders/ffx_fsr2_force16_begin.h | 1 + .../amd-fsr2/shaders/ffx_fsr2_force16_end.h | 1 + thirdparty/amd-fsr2/shaders/ffx_fsr2_lock.h | 115 + .../amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl | 56 + .../ffx_fsr2_postprocess_lock_status.h | 106 + thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas.h | 67 + .../amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl | 80 + ...ruct_dilated_velocity_and_previous_depth.h | 145 + ..._fsr2_reconstruct_previous_depth_pass.glsl | 65 + .../amd-fsr2/shaders/ffx_fsr2_reproject.h | 136 + .../amd-fsr2/shaders/ffx_fsr2_resources.h | 105 + thirdparty/amd-fsr2/shaders/ffx_fsr2_sample.h | 605 ++++ .../amd-fsr2/shaders/ffx_fsr2_tcr_autogen.h | 250 ++ .../shaders/ffx_fsr2_tcr_autogen_pass.glsl | 122 + .../amd-fsr2/shaders/ffx_fsr2_upsample.h | 194 ++ thirdparty/amd-fsr2/shaders/ffx_spd.h | 936 ++++++ 102 files changed, 22108 insertions(+), 149 deletions(-) create mode 100644 servers/rendering/renderer_rd/effects/fsr2.cpp create mode 100644 servers/rendering/renderer_rd/effects/fsr2.h create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/SCsub create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_accumulate_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_autogen_reactive_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_compute_luminance_pyramid_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_depth_clip_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_lock_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_rcas_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_reconstruct_previous_depth_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_tcr_autogen_pass.glsl create mode 100644 servers/rendering/renderer_rd/shaders/effects/motion_vector_inc.glsl create mode 100644 thirdparty/amd-fsr2/LICENSE.txt create mode 100644 thirdparty/amd-fsr2/ffx_assert.cpp create mode 100644 thirdparty/amd-fsr2/ffx_assert.h create mode 100644 thirdparty/amd-fsr2/ffx_error.h create mode 100644 thirdparty/amd-fsr2/ffx_fsr2.cpp create mode 100644 thirdparty/amd-fsr2/ffx_fsr2.h create mode 100644 thirdparty/amd-fsr2/ffx_fsr2_interface.h create mode 100644 thirdparty/amd-fsr2/ffx_fsr2_maximum_bias.h create mode 100644 thirdparty/amd-fsr2/ffx_fsr2_private.h create mode 100644 thirdparty/amd-fsr2/ffx_types.h create mode 100644 thirdparty/amd-fsr2/ffx_util.h create mode 100644 
thirdparty/amd-fsr2/patches/godot-changes.patch create mode 100644 thirdparty/amd-fsr2/shaders/ffx_common_types.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_cpu.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_glsl.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_gpu_common.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_gpu_common_half.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_hlsl.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_core_portability.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr1.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_hlsl.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_common.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_begin.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_end.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_lock.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_postprocess_lock_status.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_reproject.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_resources.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_sample.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl create mode 100644 thirdparty/amd-fsr2/shaders/ffx_fsr2_upsample.h create mode 100644 thirdparty/amd-fsr2/shaders/ffx_spd.h diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index 500eb84d08b..3200eafa00d 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -141,6 +141,11 @@ Comment: AMD FidelityFX Super Resolution Copyright: 2021, Advanced Micro Devices, Inc. License: Expat +Files: ./thirdparty/amd-fsr2/ +Comment: AMD FidelityFX Super Resolution 2 +Copyright: 2022-2023, Advanced Micro Devices, Inc. +License: Expat + Files: ./thirdparty/angle/ Comment: ANGLE Copyright: 2018, The ANGLE Project Authors. diff --git a/doc/classes/RenderSceneBuffersRD.xml b/doc/classes/RenderSceneBuffersRD.xml index d40d835d269..6ab1e6bbbd3 100644 --- a/doc/classes/RenderSceneBuffersRD.xml +++ b/doc/classes/RenderSceneBuffersRD.xml @@ -130,6 +130,19 @@ Returns the texture size of a given slice of a cached texture. + + + + + + + + + + + Returns a specific view of a slice (layer or mipmap) for a cached texture. 
+ + diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml index 7331a7fe32c..9f64cbf6103 100644 --- a/doc/classes/RenderingServer.xml +++ b/doc/classes/RenderingServer.xml @@ -4507,7 +4507,10 @@ Use AMD FidelityFX Super Resolution 1.0 upscaling for the viewport's 3D buffer. The amount of scaling can be set using [member Viewport.scaling_3d_scale]. Values less than [code]1.0[/code] will be result in the viewport being upscaled using FSR. Values greater than [code]1.0[/code] are not supported and bilinear downsampling will be used instead. A value of [code]1.0[/code] disables scaling. - + + Use AMD FidelityFX Super Resolution 2.2 upscaling for the viewport's 3D buffer. The amount of scaling can be set using [member Viewport.scaling_3d_scale]. Values less than [code]1.0[/code] will result in the viewport being upscaled using FSR2. Values greater than [code]1.0[/code] are not supported and bilinear downsampling will be used instead. A value of [code]1.0[/code] will use FSR2 at native resolution as a TAA solution. + + Represents the size of the [enum ViewportScaling3DMode] enum. @@ -4708,6 +4711,9 @@ Draws the motion vectors buffer. This is used by temporal antialiasing to correct for motion that occurs during gameplay. + + The internal buffer is drawn instead of the regular scene so you can see the per-pixel output that will be used by post-processing effects. + Variable rate shading is disabled. diff --git a/doc/classes/Viewport.xml b/doc/classes/Viewport.xml index 413d9462fec..1b5f7148ac2 100644 --- a/doc/classes/Viewport.xml +++ b/doc/classes/Viewport.xml @@ -443,7 +443,10 @@ Use AMD FidelityFX Super Resolution 1.0 upscaling for the viewport's 3D buffer. The amount of scaling can be set using [member scaling_3d_scale]. Values less than [code]1.0[/code] will be result in the viewport being upscaled using FSR. Values greater than [code]1.0[/code] are not supported and bilinear downsampling will be used instead. A value of [code]1.0[/code] disables scaling. - + + Use AMD FidelityFX Super Resolution 2.2 upscaling for the viewport's 3D buffer. The amount of scaling can be set using [member Viewport.scaling_3d_scale]. Values less than [code]1.0[/code] will result in the viewport being upscaled using FSR2. Values greater than [code]1.0[/code] are not supported and bilinear downsampling will be used instead. A value of [code]1.0[/code] will use FSR2 at native resolution as a TAA solution. + + Represents the size of the [enum Scaling3DMode] enum. @@ -553,6 +556,9 @@ + + Draws the internal resolution buffer of the scene before post-processing is applied. + The texture filter reads from the nearest pixel only. The simplest and fastest method of filtering, but the texture will look pixelized.
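As a minimal usage sketch only (not part of the patch; it assumes an existing Viewport *viewport pointer), the new scaling mode documented above is selected through the same scaling_3d_mode and scaling_3d_scale properties these entries describe:

    // Illustrative sketch. SCALING_3D_MODE_FSR2 is the enum value added by this patch.
    viewport->set_scaling_3d_mode(Viewport::SCALING_3D_MODE_FSR2);
    // Render 3D at half resolution and let FSR2 upscale to the output size.
    viewport->set_scaling_3d_scale(0.5f);
    // A scale of 1.0 keeps native resolution and uses FSR2 purely as a TAA solution.
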
diff --git a/drivers/vulkan/rendering_device_vulkan.cpp b/drivers/vulkan/rendering_device_vulkan.cpp index b2f4123fb9e..11321b2121a 100644 --- a/drivers/vulkan/rendering_device_vulkan.cpp +++ b/drivers/vulkan/rendering_device_vulkan.cpp @@ -5750,8 +5750,8 @@ RID RenderingDeviceVulkan::uniform_set_create(const Vector &p_uniforms, Buffer *buffer = uniform_buffer_owner.get_or_null(uniform.get_id(0)); ERR_FAIL_NULL_V_MSG(buffer, RID(), "Uniform buffer supplied (binding: " + itos(uniform.binding) + ") is invalid."); - ERR_FAIL_COND_V_MSG(buffer->size != (uint32_t)set_uniform.length, RID(), - "Uniform buffer supplied (binding: " + itos(uniform.binding) + ") size (" + itos(buffer->size) + " does not match size of shader uniform: (" + itos(set_uniform.length) + ")."); + ERR_FAIL_COND_V_MSG(buffer->size < (uint32_t)set_uniform.length, RID(), + "Uniform buffer supplied (binding: " + itos(uniform.binding) + ") size (" + itos(buffer->size) + " is smaller than size of shader uniform: (" + itos(set_uniform.length) + ")."); write.dstArrayElement = 0; write.descriptorCount = 1; @@ -9562,6 +9562,14 @@ uint64_t RenderingDeviceVulkan::limit_get(Limit p_limit) const { VulkanContext::SubgroupCapabilities subgroup_capabilities = context->get_subgroup_capabilities(); return subgroup_capabilities.size; } + case LIMIT_SUBGROUP_MIN_SIZE: { + VulkanContext::SubgroupCapabilities subgroup_capabilities = context->get_subgroup_capabilities(); + return subgroup_capabilities.min_size; + } + case LIMIT_SUBGROUP_MAX_SIZE: { + VulkanContext::SubgroupCapabilities subgroup_capabilities = context->get_subgroup_capabilities(); + return subgroup_capabilities.max_size; + } case LIMIT_SUBGROUP_IN_SHADERS: { VulkanContext::SubgroupCapabilities subgroup_capabilities = context->get_subgroup_capabilities(); return subgroup_capabilities.supported_stages_flags_rd(); diff --git a/drivers/vulkan/vulkan_context.cpp b/drivers/vulkan/vulkan_context.cpp index 344ea0d3240..99ee6d64d9e 100644 --- a/drivers/vulkan/vulkan_context.cpp +++ b/drivers/vulkan/vulkan_context.cpp @@ -504,6 +504,7 @@ Error VulkanContext::_initialize_device_extensions() { register_requested_device_extension(VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME, false); register_requested_device_extension(VK_KHR_MAINTENANCE_2_EXTENSION_NAME, false); register_requested_device_extension(VK_EXT_PIPELINE_CREATION_CACHE_CONTROL_EXTENSION_NAME, false); + register_requested_device_extension(VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); if (Engine::get_singleton()->is_generate_spirv_debug_info_enabled()) { register_requested_device_extension(VK_KHR_SHADER_NON_SEMANTIC_INFO_EXTENSION_NAME, true); @@ -739,9 +740,12 @@ Error VulkanContext::_check_capabilities() { multiview_capabilities.max_view_count = 0; multiview_capabilities.max_instance_count = 0; subgroup_capabilities.size = 0; + subgroup_capabilities.min_size = 0; + subgroup_capabilities.max_size = 0; subgroup_capabilities.supportedStages = 0; subgroup_capabilities.supportedOperations = 0; subgroup_capabilities.quadOperationsInAllStages = false; + subgroup_capabilities.size_control_is_supported = false; shader_capabilities.shader_float16_is_supported = false; shader_capabilities.shader_int8_is_supported = false; storage_buffer_capabilities.storage_buffer_16_bit_access_is_supported = false; @@ -886,6 +890,7 @@ Error VulkanContext::_check_capabilities() { VkPhysicalDeviceFragmentShadingRatePropertiesKHR vrsProperties{}; VkPhysicalDeviceMultiviewProperties multiviewProperties{}; VkPhysicalDeviceSubgroupProperties subgroupProperties{}; + 
VkPhysicalDeviceSubgroupSizeControlProperties subgroupSizeControlProperties = {}; VkPhysicalDeviceProperties2 physicalDeviceProperties{}; void *nextptr = nullptr; @@ -894,6 +899,15 @@ Error VulkanContext::_check_capabilities() { subgroupProperties.pNext = nextptr; nextptr = &subgroupProperties; + + subgroup_capabilities.size_control_is_supported = is_device_extension_enabled(VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME); + + if (subgroup_capabilities.size_control_is_supported) { + subgroupSizeControlProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES; + subgroupSizeControlProperties.pNext = nextptr; + + nextptr = &subgroupSizeControlProperties; + } } if (multiview_capabilities.is_supported) { @@ -916,6 +930,8 @@ Error VulkanContext::_check_capabilities() { device_properties_func(gpu, &physicalDeviceProperties); subgroup_capabilities.size = subgroupProperties.subgroupSize; + subgroup_capabilities.min_size = subgroupProperties.subgroupSize; + subgroup_capabilities.max_size = subgroupProperties.subgroupSize; subgroup_capabilities.supportedStages = subgroupProperties.supportedStages; subgroup_capabilities.supportedOperations = subgroupProperties.supportedOperations; // Note: quadOperationsInAllStages will be true if: @@ -923,6 +939,11 @@ Error VulkanContext::_check_capabilities() { // - supportedOperations has VK_SUBGROUP_FEATURE_QUAD_BIT. subgroup_capabilities.quadOperationsInAllStages = subgroupProperties.quadOperationsInAllStages; + if (subgroup_capabilities.size_control_is_supported && (subgroupSizeControlProperties.requiredSubgroupSizeStages & VK_SHADER_STAGE_COMPUTE_BIT)) { + subgroup_capabilities.min_size = subgroupSizeControlProperties.minSubgroupSize; + subgroup_capabilities.max_size = subgroupSizeControlProperties.maxSubgroupSize; + } + if (vrs_capabilities.pipeline_vrs_supported || vrs_capabilities.primitive_vrs_supported || vrs_capabilities.attachment_vrs_supported) { print_verbose("- Vulkan Variable Rate Shading supported:"); if (vrs_capabilities.pipeline_vrs_supported) { @@ -962,6 +983,8 @@ Error VulkanContext::_check_capabilities() { print_verbose("- Vulkan subgroup:"); print_verbose(" size: " + itos(subgroup_capabilities.size)); + print_verbose(" min size: " + itos(subgroup_capabilities.min_size)); + print_verbose(" max size: " + itos(subgroup_capabilities.max_size)); print_verbose(" stages: " + subgroup_capabilities.supported_stages_desc()); print_verbose(" supported ops: " + subgroup_capabilities.supported_operations_desc()); if (subgroup_capabilities.quadOperationsInAllStages) { diff --git a/drivers/vulkan/vulkan_context.h b/drivers/vulkan/vulkan_context.h index ef40aba9e1a..2ccfd137390 100644 --- a/drivers/vulkan/vulkan_context.h +++ b/drivers/vulkan/vulkan_context.h @@ -52,9 +52,12 @@ class VulkanContext { public: struct SubgroupCapabilities { uint32_t size; + uint32_t min_size; + uint32_t max_size; VkShaderStageFlags supportedStages; VkSubgroupFeatureFlags supportedOperations; VkBool32 quadOperationsInAllStages; + bool size_control_is_supported; uint32_t supported_stages_flags_rd() const; String supported_stages_desc() const; diff --git a/editor/plugins/node_3d_editor_plugin.cpp b/editor/plugins/node_3d_editor_plugin.cpp index 98ffcd11f6d..6b5ae0eeffb 100644 --- a/editor/plugins/node_3d_editor_plugin.cpp +++ b/editor/plugins/node_3d_editor_plugin.cpp @@ -3472,7 +3472,8 @@ void Node3DEditorViewport::_menu_option(int p_option) { case VIEW_DISPLAY_DEBUG_CLUSTER_DECALS: case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: case 
VIEW_DISPLAY_DEBUG_OCCLUDERS: - case VIEW_DISPLAY_MOTION_VECTORS: { + case VIEW_DISPLAY_MOTION_VECTORS: + case VIEW_DISPLAY_INTERNAL_BUFFER: { static const int display_options[] = { VIEW_DISPLAY_NORMAL, VIEW_DISPLAY_WIREFRAME, @@ -3500,6 +3501,7 @@ void Node3DEditorViewport::_menu_option(int p_option) { VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES, VIEW_DISPLAY_DEBUG_OCCLUDERS, VIEW_DISPLAY_MOTION_VECTORS, + VIEW_DISPLAY_INTERNAL_BUFFER, VIEW_MAX }; static const Viewport::DebugDraw debug_draw_modes[] = { @@ -3529,6 +3531,7 @@ void Node3DEditorViewport::_menu_option(int p_option) { Viewport::DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, Viewport::DEBUG_DRAW_OCCLUDERS, Viewport::DEBUG_DRAW_MOTION_VECTORS, + Viewport::DEBUG_DRAW_INTERNAL_BUFFER, }; for (int idx = 0; display_options[idx] != VIEW_MAX; idx++) { @@ -5112,6 +5115,7 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, int p display_submenu->add_radio_check_item(TTR("ReflectionProbe Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES); display_submenu->add_radio_check_item(TTR("Occlusion Culling Buffer"), VIEW_DISPLAY_DEBUG_OCCLUDERS); display_submenu->add_radio_check_item(TTR("Motion Vectors"), VIEW_DISPLAY_MOTION_VECTORS); + display_submenu->add_radio_check_item(TTR("Internal Buffer"), VIEW_DISPLAY_INTERNAL_BUFFER); display_submenu->set_name("display_advanced"); view_menu->get_popup()->add_submenu_item(TTR("Display Advanced..."), "display_advanced", VIEW_DISPLAY_ADVANCED); diff --git a/editor/plugins/node_3d_editor_plugin.h b/editor/plugins/node_3d_editor_plugin.h index 2fb7804a677..8b066185c7a 100644 --- a/editor/plugins/node_3d_editor_plugin.h +++ b/editor/plugins/node_3d_editor_plugin.h @@ -156,6 +156,7 @@ class Node3DEditorViewport : public Control { VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES, VIEW_DISPLAY_DEBUG_OCCLUDERS, VIEW_DISPLAY_MOTION_VECTORS, + VIEW_DISPLAY_INTERNAL_BUFFER, VIEW_DISPLAY_MAX, // > Keep in sync with menu. 
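The new editor display option above corresponds to the DEBUG_DRAW_INTERNAL_BUFFER mode bound in viewport.h/viewport.cpp below. As a minimal sketch (assuming an existing Viewport *viewport; not part of the patch), the same view can also be toggled from code:

    // Illustrative sketch. DEBUG_DRAW_INTERNAL_BUFFER is the debug draw mode added by this patch.
    viewport->set_debug_draw(Viewport::DEBUG_DRAW_INTERNAL_BUFFER); // Show the internal (pre-upscale) resolution buffer.
    viewport->set_debug_draw(Viewport::DEBUG_DRAW_DISABLED); // Restore the normal output.
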
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp index ad11c80c42d..3a4fe9a0595 100644 --- a/scene/main/viewport.cpp +++ b/scene/main/viewport.cpp @@ -4505,7 +4505,7 @@ void Viewport::_bind_methods() { #ifndef _3D_DISABLED ADD_GROUP("Scaling 3D", ""); - ADD_PROPERTY(PropertyInfo(Variant::INT, "scaling_3d_mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast)"), "set_scaling_3d_mode", "get_scaling_3d_mode"); + ADD_PROPERTY(PropertyInfo(Variant::INT, "scaling_3d_mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast),FSR 2.2 (Slow)"), "set_scaling_3d_mode", "get_scaling_3d_mode"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "scaling_3d_scale", PROPERTY_HINT_RANGE, "0.25,2.0,0.01"), "set_scaling_3d_scale", "get_scaling_3d_scale"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "texture_mipmap_bias", PROPERTY_HINT_RANGE, "-2,2,0.001"), "set_texture_mipmap_bias", "get_texture_mipmap_bias"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "fsr_sharpness", PROPERTY_HINT_RANGE, "0,2,0.1"), "set_fsr_sharpness", "get_fsr_sharpness"); @@ -4556,6 +4556,7 @@ void Viewport::_bind_methods() { BIND_ENUM_CONSTANT(SCALING_3D_MODE_BILINEAR); BIND_ENUM_CONSTANT(SCALING_3D_MODE_FSR); + BIND_ENUM_CONSTANT(SCALING_3D_MODE_FSR2); BIND_ENUM_CONSTANT(SCALING_3D_MODE_MAX); BIND_ENUM_CONSTANT(MSAA_DISABLED); @@ -4603,6 +4604,7 @@ void Viewport::_bind_methods() { BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_REFLECTION_PROBES); BIND_ENUM_CONSTANT(DEBUG_DRAW_OCCLUDERS) BIND_ENUM_CONSTANT(DEBUG_DRAW_MOTION_VECTORS) + BIND_ENUM_CONSTANT(DEBUG_DRAW_INTERNAL_BUFFER); BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST); BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_LINEAR); diff --git a/scene/main/viewport.h b/scene/main/viewport.h index 68084fc35f2..a32077a489e 100644 --- a/scene/main/viewport.h +++ b/scene/main/viewport.h @@ -98,6 +98,7 @@ public: enum Scaling3DMode { SCALING_3D_MODE_BILINEAR, SCALING_3D_MODE_FSR, + SCALING_3D_MODE_FSR2, SCALING_3D_MODE_MAX }; @@ -167,6 +168,7 @@ public: DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, DEBUG_DRAW_OCCLUDERS, DEBUG_DRAW_MOTION_VECTORS, + DEBUG_DRAW_INTERNAL_BUFFER, }; enum DefaultCanvasItemTextureFilter { diff --git a/servers/rendering/renderer_rd/effects/SCsub b/servers/rendering/renderer_rd/effects/SCsub index 86681f9c74d..8e13715447c 100644 --- a/servers/rendering/renderer_rd/effects/SCsub +++ b/servers/rendering/renderer_rd/effects/SCsub @@ -2,4 +2,33 @@ Import("env") -env.add_source_files(env.servers_sources, "*.cpp") +env_effects = env.Clone() + +# Thirdparty source files + +thirdparty_obj = [] + +thirdparty_dir = "#thirdparty/amd-fsr2/" +thirdparty_sources = ["ffx_assert.cpp", "ffx_fsr2.cpp"] +thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] + +env_effects.Prepend(CPPPATH=[thirdparty_dir]) + +# This flag doesn't actually control anything GCC specific in FSR2. It determines +# if symbols should be exported, which is not required for Godot. +env_effects.Append(CPPDEFINES=["FFX_GCC"]) + +env_thirdparty = env_effects.Clone() +env_thirdparty.disable_warnings() +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.servers_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_effects.add_source_files(module_obj, "*.cpp") +env.servers_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/servers/rendering/renderer_rd/effects/copy_effects.cpp b/servers/rendering/renderer_rd/effects/copy_effects.cpp index 834653f5c25..aa1a87cdd7a 100644 --- a/servers/rendering/renderer_rd/effects/copy_effects.cpp +++ b/servers/rendering/renderer_rd/effects/copy_effects.cpp @@ -281,8 +281,8 @@ CopyEffects::CopyEffects(bool p_prefer_raster_effects) { ba.enable_blend = true; ba.src_color_blend_factor = RD::BLEND_FACTOR_ONE; ba.dst_color_blend_factor = RD::BLEND_FACTOR_ONE; - ba.src_alpha_blend_factor = RD::BLEND_FACTOR_ONE; - ba.dst_alpha_blend_factor = RD::BLEND_FACTOR_ONE; + ba.src_alpha_blend_factor = RD::BLEND_FACTOR_ZERO; + ba.dst_alpha_blend_factor = RD::BLEND_FACTOR_ZERO; ba.color_blend_op = RD::BLEND_OP_ADD; ba.alpha_blend_op = RD::BLEND_OP_ADD; diff --git a/servers/rendering/renderer_rd/effects/debug_effects.cpp b/servers/rendering/renderer_rd/effects/debug_effects.cpp index 357d035ae93..abcd9bbfae5 100644 --- a/servers/rendering/renderer_rd/effects/debug_effects.cpp +++ b/servers/rendering/renderer_rd/effects/debug_effects.cpp @@ -340,25 +340,38 @@ void DebugEffects::draw_shadow_frustum(RID p_light, const Projection &p_cam_proj } } -void DebugEffects::draw_motion_vectors(RID p_velocity, RID p_dest_fb, Size2i p_velocity_size) { +void DebugEffects::draw_motion_vectors(RID p_velocity, RID p_depth, RID p_dest_fb, const Projection &p_current_projection, const Transform3D &p_current_transform, const Projection &p_previous_projection, const Transform3D &p_previous_transform, Size2i p_resolution) { MaterialStorage *material_storage = MaterialStorage::get_singleton(); ERR_FAIL_NULL(material_storage); UniformSetCacheRD *uniform_set_cache = UniformSetCacheRD::get_singleton(); ERR_FAIL_NULL(uniform_set_cache); - RID default_sampler = material_storage->sampler_rd_get_default(RS::CANVAS_ITEM_TEXTURE_FILTER_LINEAR, RS::CANVAS_ITEM_TEXTURE_REPEAT_DISABLED); + RID default_sampler = material_storage->sampler_rd_get_default(RS::CANVAS_ITEM_TEXTURE_FILTER_NEAREST, RS::CANVAS_ITEM_TEXTURE_REPEAT_DISABLED); RD::Uniform u_source_velocity(RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE, 0, Vector({ default_sampler, p_velocity })); + RD::Uniform u_source_depth(RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE, 1, Vector({ default_sampler, p_depth })); RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_dest_fb, RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_DROP, RD::FINAL_ACTION_DISCARD); RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, motion_vectors.pipeline.get_render_pipeline(RD::INVALID_ID, RD::get_singleton()->framebuffer_get_format(p_dest_fb), false, RD::get_singleton()->draw_list_get_current_pass())); - motion_vectors.push_constant.velocity_resolution[0] = p_velocity_size.width; - motion_vectors.push_constant.velocity_resolution[1] = p_velocity_size.height; + Projection reprojection = p_previous_projection.flipped_y() * p_previous_transform.affine_inverse() * p_current_transform * p_current_projection.flipped_y().inverse(); + RendererRD::MaterialStorage::store_camera(reprojection, motion_vectors.push_constant.reprojection_matrix); + + motion_vectors.push_constant.resolution[0] = p_resolution.width; + motion_vectors.push_constant.resolution[1] = p_resolution.height; + motion_vectors.push_constant.force_derive_from_depth = false; RID shader = motion_vectors.shader.version_get_shader(motion_vectors.shader_version, 0); - RD::get_singleton()->draw_list_bind_uniform_set(draw_list, uniform_set_cache->get_cache(shader, 0, 
u_source_velocity), 0); + RD::get_singleton()->draw_list_bind_uniform_set(draw_list, uniform_set_cache->get_cache(shader, 0, u_source_velocity, u_source_depth), 0); RD::get_singleton()->draw_list_set_push_constant(draw_list, &motion_vectors.push_constant, sizeof(MotionVectorsPushConstant)); RD::get_singleton()->draw_list_draw(draw_list, false, 1u, 3u); + +#ifdef DRAW_DERIVATION_FROM_DEPTH_ON_TOP + motion_vectors.push_constant.force_derive_from_depth = true; + + RD::get_singleton()->draw_list_set_push_constant(draw_list, &motion_vectors.push_constant, sizeof(MotionVectorsPushConstant)); + RD::get_singleton()->draw_list_draw(draw_list, false, 1u, 3u); +#endif + RD::get_singleton()->draw_list_end(); } diff --git a/servers/rendering/renderer_rd/effects/debug_effects.h b/servers/rendering/renderer_rd/effects/debug_effects.h index ae32d949120..b813d577e4c 100644 --- a/servers/rendering/renderer_rd/effects/debug_effects.h +++ b/servers/rendering/renderer_rd/effects/debug_effects.h @@ -72,8 +72,10 @@ private: } shadow_frustum; struct MotionVectorsPushConstant { - float velocity_resolution[2]; - float pad[2]; + float reprojection_matrix[16]; + float resolution[2]; + uint32_t force_derive_from_depth; + float pad; }; struct { @@ -91,7 +93,7 @@ public: ~DebugEffects(); void draw_shadow_frustum(RID p_light, const Projection &p_cam_projection, const Transform3D &p_cam_transform, RID p_dest_fb, const Rect2 p_rect); - void draw_motion_vectors(RID p_velocity, RID p_dest_fb, Size2i p_velocity_size); + void draw_motion_vectors(RID p_velocity, RID p_depth, RID p_dest_fb, const Projection &p_current_projection, const Transform3D &p_current_transform, const Projection &p_previous_projection, const Transform3D &p_previous_transform, Size2i p_resolution); }; } // namespace RendererRD diff --git a/servers/rendering/renderer_rd/effects/fsr2.cpp b/servers/rendering/renderer_rd/effects/fsr2.cpp new file mode 100644 index 00000000000..1fea1f97162 --- /dev/null +++ b/servers/rendering/renderer_rd/effects/fsr2.cpp @@ -0,0 +1,889 @@ +/**************************************************************************/ +/* fsr2.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#include "fsr2.h" + +#include "../storage_rd/material_storage.h" +#include "../uniform_set_cache_rd.h" + +using namespace RendererRD; + +#ifndef _MSC_VER +#include +#define wcscpy_s wcscpy +#endif + +static RD::TextureType ffx_resource_type_to_rd_texture_type(FfxResourceType p_type) { + switch (p_type) { + case FFX_RESOURCE_TYPE_TEXTURE1D: + return RD::TEXTURE_TYPE_1D; + case FFX_RESOURCE_TYPE_TEXTURE2D: + return RD::TEXTURE_TYPE_2D; + case FFX_RESOURCE_TYPE_TEXTURE3D: + return RD::TEXTURE_TYPE_3D; + default: + return RD::TEXTURE_TYPE_MAX; + } +} + +static FfxResourceType rd_texture_type_to_ffx_resource_type(RD::TextureType p_type) { + switch (p_type) { + case RD::TEXTURE_TYPE_1D: + return FFX_RESOURCE_TYPE_TEXTURE1D; + case RD::TEXTURE_TYPE_2D: + return FFX_RESOURCE_TYPE_TEXTURE2D; + case RD::TEXTURE_TYPE_3D: + return FFX_RESOURCE_TYPE_TEXTURE3D; + default: + return FFX_RESOURCE_TYPE_BUFFER; + } +} + +static RD::DataFormat ffx_surface_format_to_rd_format(FfxSurfaceFormat p_format) { + switch (p_format) { + case FFX_SURFACE_FORMAT_R32G32B32A32_TYPELESS: + return RD::DATA_FORMAT_R32G32B32A32_SFLOAT; + case FFX_SURFACE_FORMAT_R32G32B32A32_FLOAT: + return RD::DATA_FORMAT_R32G32B32A32_SFLOAT; + case FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT: + return RD::DATA_FORMAT_R16G16B16A16_SFLOAT; + case FFX_SURFACE_FORMAT_R16G16B16A16_UNORM: + return RD::DATA_FORMAT_R16G16B16A16_UNORM; + case FFX_SURFACE_FORMAT_R32G32_FLOAT: + return RD::DATA_FORMAT_R32G32_SFLOAT; + case FFX_SURFACE_FORMAT_R32_UINT: + return RD::DATA_FORMAT_R32_UINT; + case FFX_SURFACE_FORMAT_R8G8B8A8_TYPELESS: + return RD::DATA_FORMAT_R8G8B8A8_UNORM; + case FFX_SURFACE_FORMAT_R8G8B8A8_UNORM: + return RD::DATA_FORMAT_R8G8B8A8_UNORM; + case FFX_SURFACE_FORMAT_R11G11B10_FLOAT: + return RD::DATA_FORMAT_B10G11R11_UFLOAT_PACK32; + case FFX_SURFACE_FORMAT_R16G16_FLOAT: + return RD::DATA_FORMAT_R16G16_SFLOAT; + case FFX_SURFACE_FORMAT_R16G16_UINT: + return RD::DATA_FORMAT_R16G16_UINT; + case FFX_SURFACE_FORMAT_R16_FLOAT: + return RD::DATA_FORMAT_R16_SFLOAT; + case FFX_SURFACE_FORMAT_R16_UINT: + return RD::DATA_FORMAT_R16_UINT; + case FFX_SURFACE_FORMAT_R16_UNORM: + return RD::DATA_FORMAT_R16_UNORM; + case FFX_SURFACE_FORMAT_R16_SNORM: + return RD::DATA_FORMAT_R16_SNORM; + case FFX_SURFACE_FORMAT_R8_UNORM: + return RD::DATA_FORMAT_R8_UNORM; + case FFX_SURFACE_FORMAT_R8_UINT: + return RD::DATA_FORMAT_R8_UINT; + case FFX_SURFACE_FORMAT_R8G8_UNORM: + return RD::DATA_FORMAT_R8G8_UNORM; + case FFX_SURFACE_FORMAT_R32_FLOAT: + return RD::DATA_FORMAT_R32_SFLOAT; + default: + return RD::DATA_FORMAT_MAX; + } +} + +static FfxSurfaceFormat rd_format_to_ffx_surface_format(RD::DataFormat p_format) { + switch (p_format) { + case RD::DATA_FORMAT_R32G32B32A32_SFLOAT: + return FFX_SURFACE_FORMAT_R32G32B32A32_FLOAT; + case RD::DATA_FORMAT_R16G16B16A16_SFLOAT: + return FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT; + case RD::DATA_FORMAT_R16G16B16A16_UNORM: + return FFX_SURFACE_FORMAT_R16G16B16A16_UNORM; + case RD::DATA_FORMAT_R32G32_SFLOAT: + return FFX_SURFACE_FORMAT_R32G32_FLOAT; + case RD::DATA_FORMAT_R32_UINT: + return FFX_SURFACE_FORMAT_R32_UINT; + case RD::DATA_FORMAT_R8G8B8A8_UNORM: + return 
FFX_SURFACE_FORMAT_R8G8B8A8_UNORM; + case RD::DATA_FORMAT_B10G11R11_UFLOAT_PACK32: + return FFX_SURFACE_FORMAT_R11G11B10_FLOAT; + case RD::DATA_FORMAT_R16G16_SFLOAT: + return FFX_SURFACE_FORMAT_R16G16_FLOAT; + case RD::DATA_FORMAT_R16G16_UINT: + return FFX_SURFACE_FORMAT_R16G16_UINT; + case RD::DATA_FORMAT_R16_SFLOAT: + return FFX_SURFACE_FORMAT_R16_FLOAT; + case RD::DATA_FORMAT_R16_UINT: + return FFX_SURFACE_FORMAT_R16_UINT; + case RD::DATA_FORMAT_R16_UNORM: + return FFX_SURFACE_FORMAT_R16_UNORM; + case RD::DATA_FORMAT_R16_SNORM: + return FFX_SURFACE_FORMAT_R16_SNORM; + case RD::DATA_FORMAT_R8_UNORM: + return FFX_SURFACE_FORMAT_R8_UNORM; + case RD::DATA_FORMAT_R8_UINT: + return FFX_SURFACE_FORMAT_R8_UINT; + case RD::DATA_FORMAT_R8G8_UNORM: + return FFX_SURFACE_FORMAT_R8G8_UNORM; + case RD::DATA_FORMAT_R32_SFLOAT: + return FFX_SURFACE_FORMAT_R32_FLOAT; + default: + return FFX_SURFACE_FORMAT_UNKNOWN; + } +} + +static uint32_t ffx_usage_to_rd_usage_flags(uint32_t p_flags) { + uint32_t ret = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT; + + if (p_flags & FFX_RESOURCE_USAGE_RENDERTARGET) { + ret |= RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT; + } + + if (p_flags & FFX_RESOURCE_USAGE_UAV) { + ret |= RD::TEXTURE_USAGE_STORAGE_BIT; + ret |= RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT; + ret |= RD::TEXTURE_USAGE_CAN_COPY_TO_BIT; + } + + return ret; +} + +static FfxErrorCode create_backend_context_rd(FfxFsr2Interface *p_backend_interface, FfxDevice p_device) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + + // Store pointer to the device common to all contexts. + scratch.device = p_device; + + // Create a ring buffer of uniform buffers. + // FIXME: This could be optimized to be a single memory block if it was possible for RD to create views into a particular memory range of a UBO. + for (uint32_t i = 0; i < FSR2_UBO_RING_BUFFER_SIZE; i++) { + scratch.ubo_ring_buffer[i] = RD::get_singleton()->uniform_buffer_create(FFX_MAX_CONST_SIZE * sizeof(uint32_t)); + ERR_FAIL_COND_V(scratch.ubo_ring_buffer[i].is_null(), FFX_ERROR_BACKEND_API_ERROR); + } + + return FFX_OK; +} + +static FfxErrorCode get_device_capabilities_rd(FfxFsr2Interface *p_backend_interface, FfxDeviceCapabilities *p_out_device_capabilities, FfxDevice p_device) { + FSR2Effect::Device &effect_device = *reinterpret_cast(p_device); + + *p_out_device_capabilities = effect_device.capabilities; + + return FFX_OK; +} + +static FfxErrorCode destroy_backend_context_rd(FfxFsr2Interface *p_backend_interface) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + + for (uint32_t i = 0; i < FSR2_UBO_RING_BUFFER_SIZE; i++) { + RD::get_singleton()->free(scratch.ubo_ring_buffer[i]); + } + + return FFX_OK; +} + +static FfxErrorCode create_resource_rd(FfxFsr2Interface *p_backend_interface, const FfxCreateResourceDescription *p_create_resource_description, FfxResourceInternal *p_out_resource) { + // FSR2's base implementation won't issue a call to create a heap type that isn't just default on its own, + // so we can safely ignore it as RD does not expose this concept. + ERR_FAIL_COND_V(p_create_resource_description->heapType != FFX_HEAP_TYPE_DEFAULT, FFX_ERROR_INVALID_ARGUMENT); + + RenderingDevice *rd = RD::get_singleton(); + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + FfxResourceDescription res_desc = p_create_resource_description->resourceDescription; + + // FSR2's base implementation never requests buffer creation. 
+ ERR_FAIL_COND_V(res_desc.type != FFX_RESOURCE_TYPE_TEXTURE1D && res_desc.type != FFX_RESOURCE_TYPE_TEXTURE2D && res_desc.type != FFX_RESOURCE_TYPE_TEXTURE3D, FFX_ERROR_INVALID_ARGUMENT); + + if (res_desc.mipCount == 0) { + // Mipmap count must be derived from the resource's dimensions. + res_desc.mipCount = uint32_t(1 + floor(log2(MAX(MAX(res_desc.width, res_desc.height), res_desc.depth)))); + } + + Vector initial_data; + if (p_create_resource_description->initDataSize) { + PackedByteArray byte_array; + byte_array.resize(p_create_resource_description->initDataSize); + memcpy(byte_array.ptrw(), p_create_resource_description->initData, p_create_resource_description->initDataSize); + initial_data.push_back(byte_array); + } + + RD::TextureFormat texture_format; + texture_format.texture_type = ffx_resource_type_to_rd_texture_type(res_desc.type); + texture_format.format = ffx_surface_format_to_rd_format(res_desc.format); + texture_format.usage_bits = ffx_usage_to_rd_usage_flags(p_create_resource_description->usage); + texture_format.width = res_desc.width; + texture_format.height = res_desc.height; + texture_format.depth = res_desc.depth; + texture_format.mipmaps = res_desc.mipCount; + + RID texture = rd->texture_create(texture_format, RD::TextureView(), initial_data); + ERR_FAIL_COND_V(texture.is_null(), FFX_ERROR_BACKEND_API_ERROR); + + rd->set_resource_name(texture, String(p_create_resource_description->name)); + + // Add the resource to the storage and use the internal index to reference it. + p_out_resource->internalIndex = scratch.resources.add(texture, false, p_create_resource_description->id, res_desc); + + return FFX_OK; +} + +static FfxErrorCode register_resource_rd(FfxFsr2Interface *p_backend_interface, const FfxResource *p_in_resource, FfxResourceInternal *p_out_resource) { + if (p_in_resource->resource == nullptr) { + // Null resource case. + p_out_resource->internalIndex = -1; + return FFX_OK; + } + + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + const RID &rid = *reinterpret_cast(p_in_resource->resource); + ERR_FAIL_COND_V(rid.is_null(), FFX_ERROR_INVALID_ARGUMENT); + + // Add the resource to the storage and use the internal index to reference it. 
+ p_out_resource->internalIndex = scratch.resources.add(rid, true, FSR2Context::RESOURCE_ID_DYNAMIC, p_in_resource->description); + + return FFX_OK; +} + +static FfxErrorCode unregister_resources_rd(FfxFsr2Interface *p_backend_interface) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + LocalVector dynamic_list_copy = scratch.resources.dynamic_list; + for (uint32_t i : dynamic_list_copy) { + scratch.resources.remove(i); + } + + return FFX_OK; +} + +static FfxResourceDescription get_resource_description_rd(FfxFsr2Interface *p_backend_interface, FfxResourceInternal p_resource) { + if (p_resource.internalIndex != -1) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + return scratch.resources.descriptions[p_resource.internalIndex]; + } else { + return {}; + } +} + +static FfxErrorCode destroy_resource_rd(FfxFsr2Interface *p_backend_interface, FfxResourceInternal p_resource) { + if (p_resource.internalIndex != -1) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + if (scratch.resources.rids[p_resource.internalIndex].is_valid()) { + RD::get_singleton()->free(scratch.resources.rids[p_resource.internalIndex]); + scratch.resources.remove(p_resource.internalIndex); + } + } + + return FFX_OK; +} + +static FfxErrorCode create_pipeline_rd(FfxFsr2Interface *p_backend_interface, FfxFsr2Pass p_pass, const FfxPipelineDescription *p_pipeline_description, FfxPipelineState *p_out_pipeline) { + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + FSR2Effect::Device &device = *reinterpret_cast(scratch.device); + FSR2Effect::Pass &effect_pass = device.passes[p_pass]; + + if (effect_pass.pipeline.pipeline_rid.is_null()) { + // Create pipeline for the device if it hasn't been created yet. + effect_pass.root_signature.shader_rid = effect_pass.shader->version_get_shader(effect_pass.shader_version, effect_pass.shader_variant); + ERR_FAIL_COND_V(effect_pass.root_signature.shader_rid.is_null(), FFX_ERROR_BACKEND_API_ERROR); + + effect_pass.pipeline.pipeline_rid = RD::get_singleton()->compute_pipeline_create(effect_pass.root_signature.shader_rid); + ERR_FAIL_COND_V(effect_pass.pipeline.pipeline_rid.is_null(), FFX_ERROR_BACKEND_API_ERROR); + } + + // While this is not their intended use, we use the pipeline and root signature pointers to store the + // RIDs to the pipeline and shader that RD needs for the compute pipeline. 
+ p_out_pipeline->pipeline = reinterpret_cast(&effect_pass.pipeline); + p_out_pipeline->rootSignature = reinterpret_cast(&effect_pass.root_signature); + + p_out_pipeline->srvCount = effect_pass.sampled_bindings.size(); + ERR_FAIL_COND_V(p_out_pipeline->srvCount > FFX_MAX_NUM_SRVS, FFX_ERROR_OUT_OF_RANGE); + memcpy(p_out_pipeline->srvResourceBindings, effect_pass.sampled_bindings.ptr(), sizeof(FfxResourceBinding) * p_out_pipeline->srvCount); + + p_out_pipeline->uavCount = effect_pass.storage_bindings.size(); + ERR_FAIL_COND_V(p_out_pipeline->uavCount > FFX_MAX_NUM_UAVS, FFX_ERROR_OUT_OF_RANGE); + memcpy(p_out_pipeline->uavResourceBindings, effect_pass.storage_bindings.ptr(), sizeof(FfxResourceBinding) * p_out_pipeline->uavCount); + + p_out_pipeline->constCount = effect_pass.uniform_bindings.size(); + ERR_FAIL_COND_V(p_out_pipeline->constCount > FFX_MAX_NUM_CONST_BUFFERS, FFX_ERROR_OUT_OF_RANGE); + memcpy(p_out_pipeline->cbResourceBindings, effect_pass.uniform_bindings.ptr(), sizeof(FfxResourceBinding) * p_out_pipeline->constCount); + + bool low_resolution_mvs = (p_pipeline_description->contextFlags & FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS) == 0; + + if (p_pass == FFX_FSR2_PASS_ACCUMULATE || p_pass == FFX_FSR2_PASS_ACCUMULATE_SHARPEN) { + // Change the binding for motion vectors in this particular pass if low resolution MVs are used. + if (low_resolution_mvs) { + FfxResourceBinding &binding = p_out_pipeline->srvResourceBindings[2]; + wcscpy_s(binding.name, L"r_dilated_motion_vectors"); + } + } + + return FFX_OK; +} + +static FfxErrorCode destroy_pipeline_rd(FfxFsr2Interface *p_backend_interface, FfxPipelineState *p_pipeline) { + // We don't want to destroy pipelines when the FSR2 API deems it necessary as it'll do so whenever the context is destroyed. + + return FFX_OK; +} + +static FfxErrorCode schedule_gpu_job_rd(FfxFsr2Interface *p_backend_interface, const FfxGpuJobDescription *p_job) { + ERR_FAIL_NULL_V(p_backend_interface, FFX_ERROR_INVALID_ARGUMENT); + ERR_FAIL_NULL_V(p_job, FFX_ERROR_INVALID_ARGUMENT); + + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + scratch.gpu_jobs.push_back(*p_job); + + return FFX_OK; +} + +static FfxErrorCode execute_gpu_job_clear_float_rd(FSR2Context::Scratch &p_scratch, const FfxClearFloatJobDescription &p_job) { + RID resource = p_scratch.resources.rids[p_job.target.internalIndex]; + FfxResourceDescription &desc = p_scratch.resources.descriptions[p_job.target.internalIndex]; + + ERR_FAIL_COND_V(desc.type == FFX_RESOURCE_TYPE_BUFFER, FFX_ERROR_INVALID_ARGUMENT); + + Color color(p_job.color[0], p_job.color[1], p_job.color[2], p_job.color[3]); + RD::get_singleton()->texture_clear(resource, color, 0, desc.mipCount, 0, 1); + + return FFX_OK; +} + +static FfxErrorCode execute_gpu_job_copy_rd(FSR2Context::Scratch &p_scratch, const FfxCopyJobDescription &p_job) { + RID src = p_scratch.resources.rids[p_job.src.internalIndex]; + RID dst = p_scratch.resources.rids[p_job.dst.internalIndex]; + FfxResourceDescription &src_desc = p_scratch.resources.descriptions[p_job.src.internalIndex]; + FfxResourceDescription &dst_desc = p_scratch.resources.descriptions[p_job.dst.internalIndex]; + + ERR_FAIL_COND_V(src_desc.type == FFX_RESOURCE_TYPE_BUFFER, FFX_ERROR_INVALID_ARGUMENT); + ERR_FAIL_COND_V(dst_desc.type == FFX_RESOURCE_TYPE_BUFFER, FFX_ERROR_INVALID_ARGUMENT); + + for (uint32_t mip_level = 0; mip_level < src_desc.mipCount; mip_level++) { + // Only push the barriers on the last copy. 
+ // FIXME: This could be optimized if RenderingDevice was able to copy multiple mip levels in a single command. + BitField post_barrier = (mip_level == (src_desc.mipCount - 1)) ? RD::BARRIER_MASK_ALL_BARRIERS : RD::BARRIER_MASK_NO_BARRIER; + RD::get_singleton()->texture_copy(src, dst, Vector3(0, 0, 0), Vector3(0, 0, 0), Vector3(src_desc.width, src_desc.height, src_desc.depth), mip_level, mip_level, 0, 0, post_barrier); + } + + return FFX_OK; +} + +static FfxErrorCode execute_gpu_job_compute_rd(FSR2Context::Scratch &p_scratch, const FfxComputeJobDescription &p_job) { + UniformSetCacheRD *uniform_set_cache = UniformSetCacheRD::get_singleton(); + ERR_FAIL_NULL_V(uniform_set_cache, FFX_ERROR_BACKEND_API_ERROR); + + FSR2Effect::RootSignature &root_signature = *reinterpret_cast(p_job.pipeline.rootSignature); + ERR_FAIL_COND_V(root_signature.shader_rid.is_null(), FFX_ERROR_INVALID_ARGUMENT); + + FSR2Effect::Pipeline &backend_pipeline = *reinterpret_cast(p_job.pipeline.pipeline); + ERR_FAIL_COND_V(backend_pipeline.pipeline_rid.is_null(), FFX_ERROR_INVALID_ARGUMENT); + + Vector compute_uniforms; + for (uint32_t i = 0; i < p_job.pipeline.srvCount; i++) { + RID texture_rid = p_scratch.resources.rids[p_job.srvs[i].internalIndex]; + RD::Uniform texture_uniform(RD::UNIFORM_TYPE_TEXTURE, p_job.pipeline.srvResourceBindings[i].slotIndex, texture_rid); + compute_uniforms.push_back(texture_uniform); + } + + for (uint32_t i = 0; i < p_job.pipeline.uavCount; i++) { + RID image_rid = p_scratch.resources.rids[p_job.uavs[i].internalIndex]; + RD::Uniform storage_uniform; + storage_uniform.uniform_type = RD::UNIFORM_TYPE_IMAGE; + storage_uniform.binding = p_job.pipeline.uavResourceBindings[i].slotIndex; + + if (p_job.uavMip[i] > 0) { + LocalVector &mip_slice_rids = p_scratch.resources.mip_slice_rids[p_job.uavs[i].internalIndex]; + if (mip_slice_rids.is_empty()) { + mip_slice_rids.resize(p_scratch.resources.descriptions[p_job.uavs[i].internalIndex].mipCount); + } + + ERR_FAIL_COND_V(p_job.uavMip[i] >= mip_slice_rids.size(), FFX_ERROR_INVALID_ARGUMENT); + + if (mip_slice_rids[p_job.uavMip[i]].is_null()) { + mip_slice_rids[p_job.uavMip[i]] = RD::get_singleton()->texture_create_shared_from_slice(RD::TextureView(), image_rid, 0, p_job.uavMip[i]); + } + + ERR_FAIL_COND_V(mip_slice_rids[p_job.uavMip[i]].is_null(), FFX_ERROR_BACKEND_API_ERROR); + + storage_uniform.append_id(mip_slice_rids[p_job.uavMip[i]]); + } else { + storage_uniform.append_id(image_rid); + } + + compute_uniforms.push_back(storage_uniform); + } + + for (uint32_t i = 0; i < p_job.pipeline.constCount; i++) { + RID buffer_rid = p_scratch.ubo_ring_buffer[p_scratch.ubo_ring_buffer_index]; + p_scratch.ubo_ring_buffer_index = (p_scratch.ubo_ring_buffer_index + 1) % FSR2_UBO_RING_BUFFER_SIZE; + + BitField post_barrier = (i == (p_job.pipeline.constCount - 1)) ? 
RD::BARRIER_MASK_ALL_BARRIERS : RD::BARRIER_MASK_NO_BARRIER; + RD::get_singleton()->buffer_update(buffer_rid, 0, p_job.cbs[i].uint32Size * sizeof(uint32_t), p_job.cbs[i].data, post_barrier); + + RD::Uniform buffer_uniform(RD::UNIFORM_TYPE_UNIFORM_BUFFER, p_job.pipeline.cbResourceBindings[i].slotIndex, buffer_rid); + compute_uniforms.push_back(buffer_uniform); + } + + FSR2Effect::Device &device = *reinterpret_cast(p_scratch.device); + RD::Uniform u_point_clamp_sampler(RD::UniformType::UNIFORM_TYPE_SAMPLER, 0, device.point_clamp_sampler); + RD::Uniform u_linear_clamp_sampler(RD::UniformType::UNIFORM_TYPE_SAMPLER, 1, device.linear_clamp_sampler); + + RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin(); + RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, backend_pipeline.pipeline_rid); + RD::get_singleton()->compute_list_bind_uniform_set(compute_list, uniform_set_cache->get_cache(root_signature.shader_rid, 0, u_point_clamp_sampler, u_linear_clamp_sampler), 0); + RD::get_singleton()->compute_list_bind_uniform_set(compute_list, uniform_set_cache->get_cache_vec(root_signature.shader_rid, 1, compute_uniforms), 1); + RD::get_singleton()->compute_list_dispatch(compute_list, p_job.dimensions[0], p_job.dimensions[1], p_job.dimensions[2]); + RD::get_singleton()->compute_list_end(); + + return FFX_OK; +} + +static FfxErrorCode execute_gpu_jobs_rd(FfxFsr2Interface *p_backend_interface, FfxCommandList p_command_list) { + ERR_FAIL_NULL_V(p_backend_interface, FFX_ERROR_INVALID_ARGUMENT); + + FSR2Context::Scratch &scratch = *reinterpret_cast(p_backend_interface->scratchBuffer); + FfxErrorCode error_code = FFX_OK; + for (const FfxGpuJobDescription &job : scratch.gpu_jobs) { + switch (job.jobType) { + case FFX_GPU_JOB_CLEAR_FLOAT: { + error_code = execute_gpu_job_clear_float_rd(scratch, job.clearJobDescriptor); + } break; + case FFX_GPU_JOB_COPY: { + error_code = execute_gpu_job_copy_rd(scratch, job.copyJobDescriptor); + } break; + case FFX_GPU_JOB_COMPUTE: { + error_code = execute_gpu_job_compute_rd(scratch, job.computeJobDescriptor); + } break; + default: { + error_code = FFX_ERROR_INVALID_ARGUMENT; + } break; + } + + if (error_code != FFX_OK) { + scratch.gpu_jobs.clear(); + return error_code; + } + } + + scratch.gpu_jobs.clear(); + + return FFX_OK; +} + +static FfxResource get_resource_rd(RID *p_rid, const wchar_t *p_name) { + FfxResource res = {}; + if (p_rid->is_null()) { + return res; + } + + wcscpy_s(res.name, p_name); + + RD::TextureFormat texture_format = RD::get_singleton()->texture_get_format(*p_rid); + res.description.type = rd_texture_type_to_ffx_resource_type(texture_format.texture_type); + res.description.format = rd_format_to_ffx_surface_format(texture_format.format); + res.description.width = texture_format.width; + res.description.height = texture_format.height; + res.description.depth = texture_format.depth; + res.description.mipCount = texture_format.mipmaps; + res.description.flags = FFX_RESOURCE_FLAGS_NONE; + res.resource = reinterpret_cast(p_rid); + res.isDepth = texture_format.usage_bits & RD::TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + + return res; +} + +FSR2Context::~FSR2Context() { + ffxFsr2ContextDestroy(&fsr_context); +} + +FSR2Effect::FSR2Effect() { + FfxDeviceCapabilities &capabilities = device.capabilities; + uint64_t default_subgroup_size = RD::get_singleton()->limit_get(RD::LIMIT_SUBGROUP_SIZE); + capabilities.minimumSupportedShaderModel = FFX_SHADER_MODEL_5_1; + capabilities.waveLaneCountMin = 
RD::get_singleton()->limit_get(RD::LIMIT_SUBGROUP_MIN_SIZE); + capabilities.waveLaneCountMax = RD::get_singleton()->limit_get(RD::LIMIT_SUBGROUP_MAX_SIZE); + capabilities.fp16Supported = RD::get_singleton()->has_feature(RD::Features::SUPPORTS_FSR_HALF_FLOAT); + capabilities.raytracingSupported = false; + + bool force_wave_64 = default_subgroup_size == 32 && capabilities.waveLaneCountMax == 64; + bool use_lut = force_wave_64 || default_subgroup_size == 64; + + String general_defines_base = + "\n#define FFX_GPU\n" + "\n#define FFX_GLSL 1\n" + "\n#define FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS 1\n" + "\n#define FFX_FSR2_OPTION_HDR_COLOR_INPUT 1\n" + "\n#define FFX_FSR2_OPTION_GODOT_REACTIVE_MASK_CLAMP 1\n" + "\n#define FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS 1\n"; + + if (use_lut) { + general_defines_base += "\n#define FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE 1\n"; + } + + String general_defines = general_defines_base; + if (capabilities.fp16Supported) { + general_defines += "\n#define FFX_HALF 1\n"; + } + + Vector modes; + modes.push_back(""); + + // Since Godot currently lacks a shader reflection mechanism to persist the name of the bindings in the shader cache and + // there's also no mechanism to compile the shaders offline, the bindings are created manually by looking at the GLSL + // files included in FSR2 and mapping the macro bindings (#define FSR2_BIND_*) to their respective implementation names. + // + // It is not guaranteed these will remain consistent at all between versions of FSR2, so it'll be necessary to keep these + // bindings up to date whenever the library is updated. In such cases, it is very likely the validation layer will throw an + // error if the bindings do not match. + + { + Pass &pass = device.passes[FFX_FSR2_PASS_DEPTH_CLIP]; + pass.shader = &shaders.depth_clip; + pass.shader->initialize(modes, general_defines); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_reconstructed_previous_nearest_depth" }, + FfxResourceBinding{ 1, 0, L"r_dilated_motion_vectors" }, + FfxResourceBinding{ 2, 0, L"r_dilatedDepth" }, + FfxResourceBinding{ 3, 0, L"r_reactive_mask" }, + FfxResourceBinding{ 4, 0, L"r_transparency_and_composition_mask" }, + FfxResourceBinding{ 5, 0, L"r_prepared_input_color" }, + FfxResourceBinding{ 6, 0, L"r_previous_dilated_motion_vectors" }, + FfxResourceBinding{ 7, 0, L"r_input_motion_vectors" }, + FfxResourceBinding{ 8, 0, L"r_input_color_jittered" }, + FfxResourceBinding{ 9, 0, L"r_input_depth" }, + FfxResourceBinding{ 10, 0, L"r_input_exposure" } + }; + + pass.storage_bindings = { + // FSR2_BIND_UAV_DEPTH_CLIP (11) does not point to anything. 
+ FfxResourceBinding{ 12, 0, L"rw_dilated_reactive_masks" }, + FfxResourceBinding{ 13, 0, L"rw_prepared_input_color" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 14, 0, L"cbFSR2" } + }; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH]; + pass.shader = &shaders.reconstruct_previous_depth; + pass.shader->initialize(modes, general_defines); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_motion_vectors" }, + FfxResourceBinding{ 1, 0, L"r_input_depth" }, + FfxResourceBinding{ 2, 0, L"r_input_color_jittered" }, + FfxResourceBinding{ 3, 0, L"r_input_exposure" }, + FfxResourceBinding{ 4, 0, L"r_luma_history" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 5, 0, L"rw_reconstructed_previous_nearest_depth" }, + FfxResourceBinding{ 6, 0, L"rw_dilated_motion_vectors" }, + FfxResourceBinding{ 7, 0, L"rw_dilatedDepth" }, + FfxResourceBinding{ 8, 0, L"rw_prepared_input_color" }, + FfxResourceBinding{ 9, 0, L"rw_luma_history" }, + // FSR2_BIND_UAV_LUMA_INSTABILITY (10) does not point to anything. + FfxResourceBinding{ 11, 0, L"rw_lock_input_luma" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 12, 0, L"cbFSR2" } + }; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_LOCK]; + pass.shader = &shaders.lock; + pass.shader->initialize(modes, general_defines); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_lock_input_luma" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 1, 0, L"rw_new_locks" }, + FfxResourceBinding{ 2, 0, L"rw_reconstructed_previous_nearest_depth" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 3, 0, L"cbFSR2" } + }; + } + + { + Vector accumulate_modes; + accumulate_modes.push_back("\n"); + accumulate_modes.push_back("\n#define FFX_FSR2_OPTION_APPLY_SHARPENING 1\n"); + + String general_defines_accumulate; + if (RD::get_singleton()->get_device_vendor_name() == "NVIDIA") { + // Workaround: Disable FP16 path for the accumulate pass on NVIDIA due to reduced occupancy and high VRAM throughput. + general_defines_accumulate = general_defines_base; + } else { + general_defines_accumulate = general_defines; + } + + Pass &pass = device.passes[FFX_FSR2_PASS_ACCUMULATE]; + pass.shader = &shaders.accumulate; + pass.shader->initialize(accumulate_modes, general_defines_accumulate); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_exposure" }, + FfxResourceBinding{ 1, 0, L"r_dilated_reactive_masks" }, + FfxResourceBinding{ 2, 0, L"r_input_motion_vectors" }, + FfxResourceBinding{ 3, 0, L"r_internal_upscaled_color" }, + FfxResourceBinding{ 4, 0, L"r_lock_status" }, + FfxResourceBinding{ 5, 0, L"r_input_depth" }, + FfxResourceBinding{ 6, 0, L"r_prepared_input_color" }, + // FSR2_BIND_SRV_LUMA_INSTABILITY(7) does not point to anything. 
+ FfxResourceBinding{ 8, 0, L"r_lanczos_lut" }, + FfxResourceBinding{ 9, 0, L"r_upsample_maximum_bias_lut" }, + FfxResourceBinding{ 10, 0, L"r_imgMips" }, + FfxResourceBinding{ 11, 0, L"r_auto_exposure" }, + FfxResourceBinding{ 12, 0, L"r_luma_history" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 13, 0, L"rw_internal_upscaled_color" }, + FfxResourceBinding{ 14, 0, L"rw_lock_status" }, + FfxResourceBinding{ 15, 0, L"rw_upscaled_output" }, + FfxResourceBinding{ 16, 0, L"rw_new_locks" }, + FfxResourceBinding{ 17, 0, L"rw_luma_history" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 18, 0, L"cbFSR2" } + }; + + // Sharpen pass is a clone of the accumulate pass. + Pass &sharpen_pass = device.passes[FFX_FSR2_PASS_ACCUMULATE_SHARPEN]; + sharpen_pass = pass; + sharpen_pass.shader_variant = 1; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_RCAS]; + pass.shader = &shaders.rcas; + pass.shader->initialize(modes, general_defines_base); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_exposure" }, + FfxResourceBinding{ 1, 0, L"r_rcas_input" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 2, 0, L"rw_upscaled_output" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 3, 0, L"cbFSR2" }, + FfxResourceBinding{ 4, 0, L"cbRCAS" } + }; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID]; + pass.shader = &shaders.compute_luminance_pyramid; + pass.shader->initialize(modes, general_defines_base); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_color_jittered" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 1, 0, L"rw_spd_global_atomic" }, + FfxResourceBinding{ 2, 0, L"rw_img_mip_shading_change" }, + FfxResourceBinding{ 3, 0, L"rw_img_mip_5" }, + FfxResourceBinding{ 4, 0, L"rw_auto_exposure" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 5, 0, L"cbFSR2" }, + FfxResourceBinding{ 6, 0, L"cbSPD" } + }; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_GENERATE_REACTIVE]; + pass.shader = &shaders.autogen_reactive; + pass.shader->initialize(modes, general_defines); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_opaque_only" }, + FfxResourceBinding{ 1, 0, L"r_input_color_jittered" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 2, 0, L"rw_output_autoreactive" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 3, 0, L"cbGenerateReactive" }, + FfxResourceBinding{ 4, 0, L"cbFSR2" } + }; + } + + { + Pass &pass = device.passes[FFX_FSR2_PASS_TCR_AUTOGENERATE]; + pass.shader = &shaders.tcr_autogen; + pass.shader->initialize(modes, general_defines); + pass.shader_version = pass.shader->version_create(); + + pass.sampled_bindings = { + FfxResourceBinding{ 0, 0, L"r_input_opaque_only" }, + FfxResourceBinding{ 1, 0, L"r_input_color_jittered" }, + FfxResourceBinding{ 2, 0, L"r_input_motion_vectors" }, + FfxResourceBinding{ 3, 0, L"r_input_prev_color_pre_alpha" }, + FfxResourceBinding{ 4, 0, L"r_input_prev_color_post_alpha" }, + FfxResourceBinding{ 5, 0, L"r_reactive_mask" }, + FfxResourceBinding{ 6, 0, L"r_transparency_and_composition_mask" }, + FfxResourceBinding{ 13, 0, L"r_input_depth" } + }; + + pass.storage_bindings = { + FfxResourceBinding{ 7, 0, L"rw_output_autoreactive" }, + FfxResourceBinding{ 8, 0, L"rw_output_autocomposition" }, + FfxResourceBinding{ 9, 0, 
L"rw_output_prev_color_pre_alpha" }, + FfxResourceBinding{ 10, 0, L"rw_output_prev_color_post_alpha" } + }; + + pass.uniform_bindings = { + FfxResourceBinding{ 11, 0, L"cbFSR2" }, + FfxResourceBinding{ 12, 0, L"cbGenerateReactive" } + }; + } + + RD::SamplerState state; + state.mag_filter = RD::SAMPLER_FILTER_NEAREST; + state.min_filter = RD::SAMPLER_FILTER_NEAREST; + state.repeat_u = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE; + state.repeat_v = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE; + state.repeat_w = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE; + state.min_lod = -1000.0f; + state.max_lod = 1000.0f; + state.anisotropy_max = 1.0; + device.point_clamp_sampler = RD::get_singleton()->sampler_create(state); + ERR_FAIL_COND(device.point_clamp_sampler.is_null()); + + state.mag_filter = RD::SAMPLER_FILTER_LINEAR; + state.min_filter = RD::SAMPLER_FILTER_LINEAR; + device.linear_clamp_sampler = RD::get_singleton()->sampler_create(state); + ERR_FAIL_COND(device.linear_clamp_sampler.is_null()); +} + +FSR2Effect::~FSR2Effect() { + RD::get_singleton()->free(device.point_clamp_sampler); + RD::get_singleton()->free(device.linear_clamp_sampler); + + for (uint32_t i = 0; i < FFX_FSR2_PASS_COUNT; i++) { + RD::get_singleton()->free(device.passes[i].pipeline.pipeline_rid); + device.passes[i].shader->version_free(device.passes[i].shader_version); + } +} + +FSR2Context *FSR2Effect::create_context(Size2i p_internal_size, Size2i p_target_size) { + FSR2Context *context = memnew(RendererRD::FSR2Context); + context->fsr_desc.flags = FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE; + context->fsr_desc.maxRenderSize.width = p_internal_size.x; + context->fsr_desc.maxRenderSize.height = p_internal_size.y; + context->fsr_desc.displaySize.width = p_target_size.x; + context->fsr_desc.displaySize.height = p_target_size.y; + context->fsr_desc.device = &device; + + FfxFsr2Interface &functions = context->fsr_desc.callbacks; + functions.fpCreateBackendContext = create_backend_context_rd; + functions.fpGetDeviceCapabilities = get_device_capabilities_rd; + functions.fpDestroyBackendContext = destroy_backend_context_rd; + functions.fpCreateResource = create_resource_rd; + functions.fpRegisterResource = register_resource_rd; + functions.fpUnregisterResources = unregister_resources_rd; + functions.fpGetResourceDescription = get_resource_description_rd; + functions.fpDestroyResource = destroy_resource_rd; + functions.fpCreatePipeline = create_pipeline_rd; + functions.fpDestroyPipeline = destroy_pipeline_rd; + functions.fpScheduleGpuJob = schedule_gpu_job_rd; + functions.fpExecuteGpuJobs = execute_gpu_jobs_rd; + functions.scratchBuffer = &context->scratch; + functions.scratchBufferSize = sizeof(context->scratch); + + FfxErrorCode result = ffxFsr2ContextCreate(&context->fsr_context, &context->fsr_desc); + if (result == FFX_OK) { + return context; + } else { + memdelete(context); + return nullptr; + } +} + +void FSR2Effect::upscale(const Parameters &p_params) { + // TODO: Transparency & Composition mask is not implemented. 
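+ // Explanatory note (inferred from the assignments below rather than stated by this patch): jitterOffset is
+ // given in pixels at the internal render resolution, and motionVectorScale is multiplied into the sampled
+ // motion vectors, so passing the internal size converts the engine's normalized motion vectors into the
+ // pixel-space values FSR2 consumes.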
+ FfxFsr2DispatchDescription dispatch_desc = {}; + RID color = p_params.color; + RID depth = p_params.depth; + RID velocity = p_params.velocity; + RID reactive = p_params.reactive; + RID exposure = p_params.exposure; + RID output = p_params.output; + dispatch_desc.commandList = nullptr; + dispatch_desc.color = get_resource_rd(&color, L"color"); + dispatch_desc.depth = get_resource_rd(&depth, L"depth"); + dispatch_desc.motionVectors = get_resource_rd(&velocity, L"velocity"); + dispatch_desc.reactive = get_resource_rd(&reactive, L"reactive"); + dispatch_desc.exposure = get_resource_rd(&exposure, L"exposure"); + dispatch_desc.transparencyAndComposition = {}; + dispatch_desc.output = get_resource_rd(&output, L"output"); + dispatch_desc.colorOpaqueOnly = {}; + dispatch_desc.jitterOffset.x = p_params.jitter.x; + dispatch_desc.jitterOffset.y = p_params.jitter.y; + dispatch_desc.motionVectorScale.x = float(p_params.internal_size.width); + dispatch_desc.motionVectorScale.y = float(p_params.internal_size.height); + dispatch_desc.reset = p_params.reset_accumulation; + dispatch_desc.renderSize.width = p_params.internal_size.width; + dispatch_desc.renderSize.height = p_params.internal_size.height; + dispatch_desc.enableSharpening = (p_params.sharpness > 1e-6f); + dispatch_desc.sharpness = p_params.sharpness; + dispatch_desc.frameTimeDelta = p_params.delta_time; + dispatch_desc.preExposure = 1.0f; + dispatch_desc.cameraNear = p_params.z_near; + dispatch_desc.cameraFar = p_params.z_far; + dispatch_desc.cameraFovAngleVertical = p_params.fovy; + dispatch_desc.viewSpaceToMetersFactor = 1.0f; + dispatch_desc.enableAutoReactive = false; + dispatch_desc.autoTcThreshold = 1.0f; + dispatch_desc.autoTcScale = 1.0f; + dispatch_desc.autoReactiveScale = 1.0f; + dispatch_desc.autoReactiveMax = 1.0f; + + RendererRD::MaterialStorage::store_camera(p_params.reprojection, dispatch_desc.reprojectionMatrix); + + FfxErrorCode result = ffxFsr2ContextDispatch(&p_params.context->fsr_context, &dispatch_desc); + ERR_FAIL_COND(result != FFX_OK); +} diff --git a/servers/rendering/renderer_rd/effects/fsr2.h b/servers/rendering/renderer_rd/effects/fsr2.h new file mode 100644 index 00000000000..789714cc77d --- /dev/null +++ b/servers/rendering/renderer_rd/effects/fsr2.h @@ -0,0 +1,199 @@ +/**************************************************************************/ +/* fsr2.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/**************************************************************************/
+
+#ifndef FSR2_RD_H
+#define FSR2_RD_H
+
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_accumulate_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_autogen_reactive_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_compute_luminance_pyramid_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_depth_clip_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_lock_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_rcas_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_reconstruct_previous_depth_pass.glsl.gen.h"
+#include "servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_tcr_autogen_pass.glsl.gen.h"
+
+// This flag doesn't actually control anything GCC specific in FSR2. It determines
+// if symbols should be exported, which is not required for Godot.
+#ifndef FFX_GCC
+#define FFX_GCC
+#endif
+
+#include "thirdparty/amd-fsr2/ffx_fsr2.h"
+
+#define FSR2_MAX_QUEUED_FRAMES (4)
+#define FSR2_MAX_UNIFORM_BUFFERS (4)
+#define FSR2_MAX_BUFFERED_DESCRIPTORS (FFX_FSR2_PASS_COUNT * FSR2_MAX_QUEUED_FRAMES)
+#define FSR2_UBO_RING_BUFFER_SIZE (FSR2_MAX_BUFFERED_DESCRIPTORS * FSR2_MAX_UNIFORM_BUFFERS)
+
+namespace RendererRD {
+class FSR2Context {
+public:
+ enum ResourceID : uint32_t {
+ RESOURCE_ID_DYNAMIC = 0xFFFFFFFF
+ };
+
+ struct Resources {
+ LocalVector<RID> rids;
+ LocalVector<LocalVector<RID>> mip_slice_rids;
+ LocalVector<uint32_t> ids;
+ LocalVector<FfxResourceDescription> descriptions;
+ LocalVector<uint32_t> dynamic_list;
+ LocalVector<uint32_t> free_list;
+
+ uint32_t add(RID p_rid, bool p_dynamic, uint32_t p_id, FfxResourceDescription p_description) {
+ uint32_t ret_index;
+ if (free_list.is_empty()) {
+ ret_index = rids.size();
+ uint32_t new_size = ret_index + 1;
+ rids.resize(new_size);
+ mip_slice_rids.resize(new_size);
+ ids.resize(new_size);
+ descriptions.resize(new_size);
+ } else {
+ uint32_t end_index = free_list.size() - 1;
+ ret_index = free_list[end_index];
+ free_list.resize(end_index);
+ }
+
+ rids[ret_index] = p_rid;
+ mip_slice_rids[ret_index].clear();
+ ids[ret_index] = p_id;
+ descriptions[ret_index] = p_description;
+
+ if (p_dynamic) {
+ dynamic_list.push_back(ret_index);
+ }
+
+ return ret_index;
+ }
+
+ void remove(uint32_t p_index) {
+ DEV_ASSERT(p_index < rids.size());
+ free_list.push_back(p_index);
+ rids[p_index] = RID();
+ mip_slice_rids[p_index].clear();
+ ids[p_index] = 0;
+ descriptions[p_index] = {};
+ dynamic_list.erase(p_index);
+ }
+
+ uint32_t size() const {
+ return rids.size();
+ }
+ };
+
+ struct Scratch {
+ Resources resources;
+ LocalVector<FfxGpuJobDescription> gpu_jobs;
+ RID ubo_ring_buffer[FSR2_UBO_RING_BUFFER_SIZE];
+ uint32_t ubo_ring_buffer_index = 0;
+ FfxDevice device = nullptr;
+ };
+
+ Scratch scratch;
+ FfxFsr2Context fsr_context;
+ FfxFsr2ContextDescription fsr_desc;
+
+ ~FSR2Context();
+};
+
+class FSR2Effect {
+public:
+
struct RootSignature { + // Proxy structure to store the shader required by RD that uses the terminology used by the FSR2 API. + RID shader_rid; + }; + + struct Pipeline { + RID pipeline_rid; + }; + + struct Pass { + ShaderRD *shader; + RID shader_version; + RootSignature root_signature; + uint32_t shader_variant = 0; + Pipeline pipeline; + Vector sampled_bindings; + Vector storage_bindings; + Vector uniform_bindings; + }; + + struct Device { + Pass passes[FFX_FSR2_PASS_COUNT]; + FfxDeviceCapabilities capabilities; + RID point_clamp_sampler; + RID linear_clamp_sampler; + }; + + struct Parameters { + FSR2Context *context; + Size2i internal_size; + RID color; + RID depth; + RID velocity; + RID reactive; + RID exposure; + RID output; + float z_near = 0.0f; + float z_far = 0.0f; + float fovy = 0.0f; + Vector2 jitter; + float delta_time = 0.0f; + float sharpness = 0.0f; + bool reset_accumulation = false; + Projection reprojection; + }; + + FSR2Effect(); + ~FSR2Effect(); + FSR2Context *create_context(Size2i p_internal_size, Size2i p_target_size); + void upscale(const Parameters &p_params); + +private: + struct { + Fsr2DepthClipPassShaderRD depth_clip; + Fsr2ReconstructPreviousDepthPassShaderRD reconstruct_previous_depth; + Fsr2LockPassShaderRD lock; + Fsr2AccumulatePassShaderRD accumulate; + Fsr2AccumulatePassShaderRD accumulate_sharpen; + Fsr2RcasPassShaderRD rcas; + Fsr2ComputeLuminancePyramidPassShaderRD compute_luminance_pyramid; + Fsr2AutogenReactivePassShaderRD autogen_reactive; + Fsr2TcrAutogenPassShaderRD tcr_autogen; + } shaders; + + Device device; +}; + +} // namespace RendererRD + +#endif // FSR2_RD_H diff --git a/servers/rendering/renderer_rd/effects/taa.cpp b/servers/rendering/renderer_rd/effects/taa.cpp index 61e0d3866ca..c1037ec11a2 100644 --- a/servers/rendering/renderer_rd/effects/taa.cpp +++ b/servers/rendering/renderer_rd/effects/taa.cpp @@ -47,20 +47,6 @@ TAA::~TAA() { taa_shader.version_free(shader_version); } -void TAA::msaa_resolve(Ref p_render_buffers) { - if (!p_render_buffers->has_velocity_buffer(true)) { - // nothing to resolve - return; - } - - for (uint32_t v = 0; v < p_render_buffers->get_view_count(); v++) { - RID velocity_buffer_msaa = p_render_buffers->get_velocity_buffer(true, v); - RID velocity_buffer = p_render_buffers->get_velocity_buffer(false, v); - - RD::get_singleton()->texture_resolve_multisample(velocity_buffer_msaa, velocity_buffer); - } -} - void TAA::resolve(RID p_frame, RID p_temp, RID p_depth, RID p_velocity, RID p_prev_velocity, RID p_history, Size2 p_resolution, float p_z_near, float p_z_far) { UniformSetCacheRD *uniform_set_cache = UniformSetCacheRD::get_singleton(); ERR_FAIL_NULL(uniform_set_cache); diff --git a/servers/rendering/renderer_rd/effects/taa.h b/servers/rendering/renderer_rd/effects/taa.h index 9e7ad76fb8f..f50e297fe56 100644 --- a/servers/rendering/renderer_rd/effects/taa.h +++ b/servers/rendering/renderer_rd/effects/taa.h @@ -45,7 +45,6 @@ public: TAA(); ~TAA(); - void msaa_resolve(Ref p_render_buffers); void process(Ref p_render_buffers, RD::DataFormat p_format, float p_z_near, float p_z_far); private: diff --git a/servers/rendering/renderer_rd/environment/sky.cpp b/servers/rendering/renderer_rd/environment/sky.cpp index 310aefe1b1a..0ccd36b340e 100644 --- a/servers/rendering/renderer_rd/environment/sky.cpp +++ b/servers/rendering/renderer_rd/environment/sky.cpp @@ -973,7 +973,7 @@ SkyRD::~SkyRD() { } } -void SkyRD::setup_sky(RID p_env, Ref p_render_buffers, const PagedArray &p_lights, RID p_camera_attributes, uint32_t 
p_view_count, const Projection *p_view_projections, const Vector3 *p_view_eye_offsets, const Transform3D &p_cam_transform, const Projection &p_cam_projection, const Size2i p_screen_size, RendererSceneRenderRD *p_scene_render) { +void SkyRD::setup_sky(RID p_env, Ref p_render_buffers, const PagedArray &p_lights, RID p_camera_attributes, uint32_t p_view_count, const Projection *p_view_projections, const Vector3 *p_view_eye_offsets, const Transform3D &p_cam_transform, const Projection &p_cam_projection, const Size2i p_screen_size, Vector2 p_jitter, RendererSceneRenderRD *p_scene_render) { RendererRD::LightStorage *light_storage = RendererRD::LightStorage::get_singleton(); RendererRD::MaterialStorage *material_storage = RendererRD::MaterialStorage::get_singleton(); ERR_FAIL_COND(p_env.is_null()); @@ -1173,18 +1173,21 @@ void SkyRD::setup_sky(RID p_env, Ref p_render_buffers, con } } + Projection correction; + correction.add_jitter_offset(p_jitter); + sky_scene_state.view_count = p_view_count; sky_scene_state.cam_transform = p_cam_transform; - sky_scene_state.cam_projection = p_cam_projection; // We only use this when rendering a single view. + sky_scene_state.cam_projection = correction * p_cam_projection; // We only use this when rendering a single view. // Our info in our UBO is only used if we're rendering stereo. for (uint32_t i = 0; i < p_view_count; i++) { - Projection view_inv_projection = p_view_projections[i].inverse(); + Projection view_inv_projection = (correction * p_view_projections[i]).inverse(); if (p_view_count > 1) { RendererRD::MaterialStorage::store_camera(p_cam_projection * view_inv_projection, sky_scene_state.ubo.combined_reprojection[i]); } else { Projection ident; - RendererRD::MaterialStorage::store_camera(ident, sky_scene_state.ubo.combined_reprojection[i]); + RendererRD::MaterialStorage::store_camera(correction, sky_scene_state.ubo.combined_reprojection[i]); } RendererRD::MaterialStorage::store_camera(view_inv_projection, sky_scene_state.ubo.view_inv_projections[i]); diff --git a/servers/rendering/renderer_rd/environment/sky.h b/servers/rendering/renderer_rd/environment/sky.h index ee2d81757ce..b146a416f9b 100644 --- a/servers/rendering/renderer_rd/environment/sky.h +++ b/servers/rendering/renderer_rd/environment/sky.h @@ -294,7 +294,7 @@ public: void set_texture_format(RD::DataFormat p_texture_format); ~SkyRD(); - void setup_sky(RID p_env, Ref p_render_buffers, const PagedArray &p_lights, RID p_camera_attributes, uint32_t p_view_count, const Projection *p_view_projections, const Vector3 *p_view_eye_offsets, const Transform3D &p_cam_transform, const Projection &p_cam_projection, const Size2i p_screen_size, RendererSceneRenderRD *p_scene_render); + void setup_sky(RID p_env, Ref p_render_buffers, const PagedArray &p_lights, RID p_camera_attributes, uint32_t p_view_count, const Projection *p_view_projections, const Vector3 *p_view_eye_offsets, const Transform3D &p_cam_transform, const Projection &p_cam_projection, const Size2i p_screen_size, Vector2 p_jitter, RendererSceneRenderRD *p_scene_render); void update_radiance_buffers(Ref p_render_buffers, RID p_env, const Vector3 &p_global_pos, double p_time, float p_luminance_multiplier = 1.0); void update_res_buffers(Ref p_render_buffers, RID p_env, double p_time, float p_luminance_multiplier = 1.0); void draw_sky(RD::DrawListID p_draw_list, Ref p_render_buffers, RID p_env, RID p_fb, double p_time, float p_luminance_multiplier = 1.0); diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp 
b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp index 5456e5b1829..4709ae91264 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp @@ -105,6 +105,12 @@ void RenderForwardClustered::RenderBufferDataForwardClustered::ensure_voxelgi() } } +void RenderForwardClustered::RenderBufferDataForwardClustered::ensure_fsr2(RendererRD::FSR2Effect *p_effect) { + if (fsr2_context == nullptr) { + fsr2_context = p_effect->create_context(render_buffers->get_internal_size(), render_buffers->get_target_size()); + } +} + void RenderForwardClustered::RenderBufferDataForwardClustered::free_data() { // JIC, should already have been cleared if (render_buffers) { @@ -120,6 +126,11 @@ void RenderForwardClustered::RenderBufferDataForwardClustered::free_data() { cluster_builder = nullptr; } + if (fsr2_context) { + memdelete(fsr2_context); + fsr2_context = nullptr; + } + if (!render_sdfgi_uniform_set.is_null() && RD::get_singleton()->uniform_set_is_valid(render_sdfgi_uniform_set)) { RD::get_singleton()->free(render_sdfgi_uniform_set); } @@ -230,6 +241,14 @@ RID RenderForwardClustered::RenderBufferDataForwardClustered::get_specular_only_ return FramebufferCacheRD::get_singleton()->get_cache_multiview(render_buffers->get_view_count(), specular); } +RID RenderForwardClustered::RenderBufferDataForwardClustered::get_velocity_only_fb() { + bool use_msaa = render_buffers->get_msaa_3d() != RS::VIEWPORT_MSAA_DISABLED; + + RID velocity = render_buffers->get_texture(RB_SCOPE_BUFFERS, use_msaa ? RB_TEX_VELOCITY_MSAA : RB_TEX_VELOCITY); + + return FramebufferCacheRD::get_singleton()->get_cache_multiview(render_buffers->get_view_count(), velocity); +} + void RenderForwardClustered::setup_render_buffer_data(Ref p_render_buffers) { Ref data; data.instantiate(); @@ -285,8 +304,10 @@ void RenderForwardClustered::_render_list_template(RenderingDevice::DrawListID p const GeometryInstanceSurfaceDataCache *surf = p_params->elements[i]; const RenderElementInfo &element_info = p_params->element_info[i]; - if ((p_pass_mode == PASS_MODE_COLOR && !(p_color_pass_flags & COLOR_PASS_FLAG_TRANSPARENT)) && !(surf->flags & GeometryInstanceSurfaceDataCache::FLAG_PASS_OPAQUE)) { - continue; // Objects with "Depth-prepass" transparency are included in both render lists, but should only be rendered in the transparent pass + if (p_pass_mode == PASS_MODE_COLOR && surf->color_pass_inclusion_mask && (p_color_pass_flags & surf->color_pass_inclusion_mask) == 0) { + // Some surfaces can be repeated in multiple render lists. We exclude them from being rendered on the color pass based on the + // features supported by the pass compared to the exclusion mask. 
+ continue; } if (surf->owner->instance_count == 0) { @@ -582,7 +603,7 @@ void RenderForwardClustered::_render_list_with_threads(RenderListParameters *p_p } } -void RenderForwardClustered::_setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, bool p_flip_y, const Color &p_default_bg_color, bool p_opaque_render_buffers, bool p_pancake_shadows, int p_index) { +void RenderForwardClustered::_setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, bool p_flip_y, const Color &p_default_bg_color, bool p_opaque_render_buffers, bool p_apply_alpha_multiplier, bool p_pancake_shadows, int p_index) { RendererRD::LightStorage *light_storage = RendererRD::LightStorage::get_singleton(); Ref rd = p_render_data->render_buffers; @@ -598,7 +619,7 @@ void RenderForwardClustered::_setup_environment(const RenderDataRD *p_render_dat } } - p_render_data->scene_data->update_ubo(scene_state.uniform_buffers[p_index], get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_flip_y, p_pancake_shadows, p_screen_size, p_default_bg_color, _render_buffers_get_luminance_multiplier(), p_opaque_render_buffers); + p_render_data->scene_data->update_ubo(scene_state.uniform_buffers[p_index], get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_flip_y, p_pancake_shadows, p_screen_size, p_default_bg_color, _render_buffers_get_luminance_multiplier(), p_opaque_render_buffers, p_apply_alpha_multiplier); // now do implementation UBO @@ -775,8 +796,9 @@ _FORCE_INLINE_ static uint32_t _indices_to_primitives(RS::PrimitiveType p_primit static const uint32_t subtractor[RS::PRIMITIVE_MAX] = { 0, 0, 1, 0, 1 }; return (p_indices - subtractor[p_primitive]) / divisor[p_primitive]; } -void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, uint32_t p_color_pass_flags = 0, bool p_using_sdfgi, bool p_using_opaque_gi, bool p_append) { +void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, bool p_using_sdfgi, bool p_using_opaque_gi, bool p_using_motion_pass, bool p_append) { RendererRD::MeshStorage *mesh_storage = RendererRD::MeshStorage::get_singleton(); + uint64_t frame = RSG::rasterizer->get_frame_number(); if (p_render_list == RENDER_LIST_OPAQUE) { scene_state.used_sss = false; @@ -797,7 +819,9 @@ void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, con if (!p_append) { rl->clear(); if (p_render_list == RENDER_LIST_OPAQUE) { - render_list[RENDER_LIST_ALPHA].clear(); //opaque fills alpha too + // Opaque fills motion and alpha lists. 
+ render_list[RENDER_LIST_MOTION].clear(); + render_list[RENDER_LIST_ALPHA].clear(); } } @@ -827,6 +851,7 @@ void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, con } bool uses_lightmap = false; bool uses_gi = false; + bool uses_motion = false; float fade_alpha = 1.0; if (inst->fade_near || inst->fade_far) { @@ -914,6 +939,14 @@ void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, con inst->gi_offset_cache = 0xFFFFFFFF; } } + + if (p_pass_mode == PASS_MODE_COLOR && p_using_motion_pass) { + bool transform_changed = inst->prev_transform_change_frame == frame; + bool has_mesh_instance = inst->mesh_instance.is_valid(); + bool uses_particles = inst->base_flags & INSTANCE_DATA_FLAG_PARTICLES; + bool is_multimesh_with_motion = !uses_particles && (inst->base_flags & INSTANCE_DATA_FLAG_MULTIMESH) && mesh_storage->_multimesh_uses_motion_vectors_offsets(inst->data->base); + uses_motion = transform_changed || has_mesh_instance || uses_particles || is_multimesh_with_motion; + } } inst->flags_cache = flags; @@ -990,11 +1023,18 @@ void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, con if (!force_alpha && (surf->flags & (GeometryInstanceSurfaceDataCache::FLAG_PASS_DEPTH | GeometryInstanceSurfaceDataCache::FLAG_PASS_OPAQUE))) { rl->add_element(surf); } + if (force_alpha || (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_PASS_ALPHA)) { + surf->color_pass_inclusion_mask = COLOR_PASS_FLAG_TRANSPARENT; render_list[RENDER_LIST_ALPHA].add_element(surf); if (uses_gi) { surf->sort.uses_forward_gi = 1; } + } else if (p_using_motion_pass && (uses_motion || (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_MOTION_VECTOR))) { + surf->color_pass_inclusion_mask = COLOR_PASS_FLAG_MOTION_VECTORS; + render_list[RENDER_LIST_MOTION].add_element(surf); + } else { + surf->color_pass_inclusion_mask = 0; } if (uses_lightmap) { @@ -1580,16 +1620,24 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co RENDER_TIMESTAMP("Setup 3D Scene"); + bool using_debug_mvs = get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_MOTION_VECTORS; + bool using_taa = rb->get_use_taa(); + bool using_fsr2 = rb->get_scaling_3d_mode() == RS::VIEWPORT_SCALING_3D_MODE_FSR2; + // check if we need motion vectors - if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_MOTION_VECTORS) { - p_render_data->scene_data->calculate_motion_vectors = true; - } else if (!is_reflection_probe && rb->get_use_taa()) { - p_render_data->scene_data->calculate_motion_vectors = true; + bool motion_vectors_required; + if (using_debug_mvs) { + motion_vectors_required = true; + } else if (!is_reflection_probe && using_taa) { + motion_vectors_required = true; + } else if (!is_reflection_probe && using_fsr2) { + motion_vectors_required = true; } else { - p_render_data->scene_data->calculate_motion_vectors = false; + motion_vectors_required = false; } //p_render_data->scene_data->subsurface_scatter_width = subsurface_scatter_size; + p_render_data->scene_data->calculate_motion_vectors = motion_vectors_required; p_render_data->scene_data->directional_light_count = 0; p_render_data->scene_data->opaque_prepass_threshold = 0.99f; @@ -1607,6 +1655,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co bool using_voxelgi = false; bool reverse_cull = p_render_data->scene_data->cam_transform.basis.determinant() < 0; bool using_ssil = !is_reflection_probe && p_render_data->environment.is_valid() && 
environment_get_ssil_enabled(p_render_data->environment); + bool using_motion_pass = rb_data.is_valid() && using_fsr2; if (is_reflection_probe) { uint32_t resolution = light_storage->reflection_probe_instance_get_resolution(p_render_data->reflection_probe); @@ -1625,7 +1674,7 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co } else { screen_size = rb->get_internal_size(); - if (rb->get_use_taa() || get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_MOTION_VECTORS) { + if (p_render_data->scene_data->calculate_motion_vectors) { color_pass_flags |= COLOR_PASS_FLAG_MOTION_VECTORS; scene_shader.enable_advanced_shader_group(); } @@ -1663,12 +1712,16 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co _setup_voxelgis(*p_render_data->voxel_gi_instances); _setup_environment(p_render_data, is_reflection_probe, screen_size, !is_reflection_probe, p_default_bg_color, false); - _update_render_base_uniform_set(rb->get_samplers()); //may have changed due to the above (light buffer enlarged, as an example) + _update_render_base_uniform_set(rb->get_samplers()); // May have changed due to the above (light buffer enlarged, as an example). - _fill_render_list(RENDER_LIST_OPAQUE, p_render_data, PASS_MODE_COLOR, color_pass_flags, using_sdfgi, using_sdfgi || using_voxelgi); + _fill_render_list(RENDER_LIST_OPAQUE, p_render_data, PASS_MODE_COLOR, using_sdfgi, using_sdfgi || using_voxelgi, using_motion_pass); render_list[RENDER_LIST_OPAQUE].sort_by_key(); + render_list[RENDER_LIST_MOTION].sort_by_key(); render_list[RENDER_LIST_ALPHA].sort_by_reverse_depth_and_priority(); - _fill_instance_data(RENDER_LIST_OPAQUE, p_render_data->render_info ? p_render_data->render_info->info[RS::VIEWPORT_RENDER_INFO_TYPE_VISIBLE] : (int *)nullptr); + + int *render_info = p_render_data->render_info ? 
p_render_data->render_info->info[RS::VIEWPORT_RENDER_INFO_TYPE_VISIBLE] : (int *)nullptr; + _fill_instance_data(RENDER_LIST_OPAQUE, render_info); + _fill_instance_data(RENDER_LIST_MOTION, render_info); _fill_instance_data(RENDER_LIST_ALPHA); RD::get_singleton()->draw_command_end_label(); @@ -1792,9 +1845,9 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co correction.set_depth_correction(true); Projection projection = correction * p_render_data->scene_data->cam_projection; - sky.setup_sky(p_render_data->environment, rb, *p_render_data->lights, p_render_data->camera_attributes, 1, &projection, &eye_offset, p_render_data->scene_data->cam_transform, projection, screen_size, this); + sky.setup_sky(p_render_data->environment, rb, *p_render_data->lights, p_render_data->camera_attributes, 1, &projection, &eye_offset, p_render_data->scene_data->cam_transform, projection, screen_size, Vector2(0.0f, 0.0f), this); } else { - sky.setup_sky(p_render_data->environment, rb, *p_render_data->lights, p_render_data->camera_attributes, p_render_data->scene_data->view_count, p_render_data->scene_data->view_projection, p_render_data->scene_data->view_eye_offset, p_render_data->scene_data->cam_transform, p_render_data->scene_data->cam_projection, screen_size, this); + sky.setup_sky(p_render_data->environment, rb, *p_render_data->lights, p_render_data->camera_attributes, p_render_data->scene_data->view_count, p_render_data->scene_data->view_projection, p_render_data->scene_data->view_eye_offset, p_render_data->scene_data->cam_transform, p_render_data->scene_data->cam_projection, screen_size, p_render_data->scene_data->taa_jitter, this); } sky_energy_multiplier *= bg_energy_multiplier; @@ -1892,37 +1945,71 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co // Shadow pass can change the base uniform set samplers. _update_render_base_uniform_set(rb->get_samplers()); - _setup_environment(p_render_data, is_reflection_probe, screen_size, !is_reflection_probe, p_default_bg_color, true); + _setup_environment(p_render_data, is_reflection_probe, screen_size, !is_reflection_probe, p_default_bg_color, true, using_motion_pass); RENDER_TIMESTAMP("Render Opaque Pass"); RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_OPAQUE, p_render_data, radiance_texture, true); - bool can_continue_color = !scene_state.used_screen_texture && !using_ssr && !using_sss; bool can_continue_depth = !(scene_state.used_depth_texture || scene_state.used_normal_texture) && !using_ssr && !using_sss; { + bool render_motion_pass = !render_list[RENDER_LIST_MOTION].elements.is_empty(); bool will_continue_color = (can_continue_color || draw_sky || draw_sky_fog_only || debug_voxelgis || debug_sdfgi_probes); bool will_continue_depth = (can_continue_depth || draw_sky || draw_sky_fog_only || debug_voxelgis || debug_sdfgi_probes); + RD::FinalAction final_color_action = will_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ; + RD::FinalAction final_depth_action = will_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ; - Vector c; { - Color cc = clear_color.srgb_to_linear(); - if (using_separate_specular || rb_data.is_valid()) { - cc.a = 0; //subsurf scatter must be 0 - } - c.push_back(cc); + Vector c; + { + Color cc = clear_color.srgb_to_linear(); + if (using_separate_specular || rb_data.is_valid()) { + // Effects that rely on separate specular, like subsurface scattering, must clear the alpha to zero. 
+ cc.a = 0; + } + c.push_back(cc); - if (rb_data.is_valid()) { - c.push_back(Color(0, 0, 0, 0)); // Separate specular - c.push_back(Color(0, 0, 0, 0)); // Motion vectors + if (rb_data.is_valid()) { + c.push_back(Color(0, 0, 0, 0)); // Separate specular. + c.push_back(Color(0, 0, 0, 0)); // Motion vector. Pushed to the clear color vector even if the framebuffer isn't bound. + } + } + + uint32_t opaque_color_pass_flags = using_motion_pass ? (color_pass_flags & ~COLOR_PASS_FLAG_MOTION_VECTORS) : color_pass_flags; + RID opaque_framebuffer = using_motion_pass ? rb_data->get_color_pass_fb(opaque_color_pass_flags) : color_framebuffer; + RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, PASS_MODE_COLOR, opaque_color_pass_flags, rb_data.is_null(), p_render_data->directional_light_soft_shadows, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), p_render_data->scene_data->lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, p_render_data->scene_data->view_count); + _render_list_with_threads(&render_list_params, opaque_framebuffer, keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, render_motion_pass ? RD::FINAL_ACTION_CONTINUE : final_color_action, depth_pre_pass ? (continue_depth ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP) : RD::INITIAL_ACTION_CLEAR, render_motion_pass ? RD::FINAL_ACTION_CONTINUE : final_depth_action, c, 1.0, 0); + } + + RD::get_singleton()->draw_command_end_label(); + + if (using_motion_pass) { + Vector motion_vector_clear_colors; + motion_vector_clear_colors.push_back(Color(-1, -1, 0, 0)); + RD::get_singleton()->draw_list_begin(rb_data->get_velocity_only_fb(), RD::INITIAL_ACTION_CLEAR, render_motion_pass ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CONTINUE, RD::FINAL_ACTION_CONTINUE, motion_vector_clear_colors); + RD::get_singleton()->draw_list_end(); + } + + if (render_motion_pass) { + RD::get_singleton()->draw_command_begin_label("Render Motion Pass"); + + RENDER_TIMESTAMP("Render Motion Pass"); + + rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_MOTION, p_render_data, radiance_texture, true); + + RenderListParameters render_list_params(render_list[RENDER_LIST_MOTION].elements.ptr(), render_list[RENDER_LIST_MOTION].element_info.ptr(), render_list[RENDER_LIST_MOTION].elements.size(), reverse_cull, PASS_MODE_COLOR, color_pass_flags, rb_data.is_null(), p_render_data->directional_light_soft_shadows, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), p_render_data->scene_data->lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, p_render_data->scene_data->view_count); + _render_list_with_threads(&render_list_params, color_framebuffer, RD::INITIAL_ACTION_CONTINUE, final_color_action, RD::INITIAL_ACTION_CONTINUE, final_depth_action); + + if (will_continue_color) { + // Close the motion vectors framebuffer as it'll no longer be used. 
+ RD::get_singleton()->draw_list_begin(rb_data->get_velocity_only_fb(), RD::INITIAL_ACTION_CONTINUE, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CONTINUE, RD::FINAL_ACTION_CONTINUE); + RD::get_singleton()->draw_list_end(); } } - RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, PASS_MODE_COLOR, color_pass_flags, rb_data.is_null(), p_render_data->directional_light_soft_shadows, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), p_render_data->scene_data->lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, p_render_data->scene_data->view_count); - _render_list_with_threads(&render_list_params, color_framebuffer, keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, will_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, depth_pre_pass ? (continue_depth ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP) : RD::INITIAL_ACTION_CLEAR, will_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, c, 1.0, 0); if (will_continue_color && using_separate_specular) { - // close the specular framebuffer, as it's no longer used + // Close the specular framebuffer as it'll no longer be used. RD::get_singleton()->draw_list_begin(rb_data->get_specular_only_fb(), RD::INITIAL_ACTION_CONTINUE, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CONTINUE, RD::FINAL_ACTION_CONTINUE); RD::get_singleton()->draw_list_end(); } @@ -2052,6 +2139,11 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co { uint32_t transparent_color_pass_flags = (color_pass_flags | COLOR_PASS_FLAG_TRANSPARENT) & ~(COLOR_PASS_FLAG_SEPARATE_SPECULAR); + if (using_motion_pass) { + // Motion vectors on transparent draw calls are not required when using the reactive mask. + transparent_color_pass_flags &= ~(COLOR_PASS_FLAG_MOTION_VECTORS); + } + RID alpha_framebuffer = rb_data.is_valid() ? rb_data->get_color_pass_fb(transparent_color_pass_flags) : color_only_framebuffer; RenderListParameters render_list_params(render_list[RENDER_LIST_ALPHA].elements.ptr(), render_list[RENDER_LIST_ALPHA].element_info.ptr(), render_list[RENDER_LIST_ALPHA].elements.size(), false, PASS_MODE_COLOR, transparent_color_pass_flags, rb_data.is_null(), p_render_data->directional_light_soft_shadows, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), p_render_data->scene_data->lod_distance_multiplier, p_render_data->scene_data->screen_mesh_lod_threshold, p_render_data->scene_data->view_count); _render_list_with_threads(&render_list_params, alpha_framebuffer, can_continue_color ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ, can_continue_depth ? 
RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ); @@ -2064,12 +2156,14 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co RD::get_singleton()->draw_command_begin_label("Resolve"); if (rb_data.is_valid() && rb->get_msaa_3d() != RS::VIEWPORT_MSAA_DISABLED) { + bool resolve_velocity_buffer = (using_taa || using_fsr2) && rb->has_velocity_buffer(true); for (uint32_t v = 0; v < rb->get_view_count(); v++) { RD::get_singleton()->texture_resolve_multisample(rb->get_color_msaa(v), rb->get_internal_texture(v)); resolve_effects->resolve_depth(rb->get_depth_msaa(v), rb->get_depth_texture(v), rb->get_internal_size(), texture_multisamples[rb->get_msaa_3d()]); - } - if (taa && rb->get_use_taa()) { - taa->msaa_resolve(rb); + + if (resolve_velocity_buffer) { + RD::get_singleton()->texture_resolve_multisample(rb->get_velocity_buffer(true, v), rb->get_velocity_buffer(false, v)); + } } } @@ -2082,9 +2176,51 @@ void RenderForwardClustered::_render_scene(RenderDataRD *p_render_data, const Co } RD::get_singleton()->draw_command_end_label(); - if (rb_data.is_valid() && taa && rb->get_use_taa()) { - RENDER_TIMESTAMP("TAA") - taa->process(rb, _render_buffers_get_color_format(), p_render_data->scene_data->z_near, p_render_data->scene_data->z_far); + if (rb_data.is_valid() && (using_fsr2 || using_taa)) { + if (using_fsr2) { + rb->ensure_upscaled(); + rb_data->ensure_fsr2(fsr2_effect); + + RID exposure; + if (RSG::camera_attributes->camera_attributes_uses_auto_exposure(p_render_data->camera_attributes)) { + exposure = luminance->get_current_luminance_buffer(rb); + } + + RENDER_TIMESTAMP("FSR2"); + for (uint32_t v = 0; v < rb->get_view_count(); v++) { + real_t fov = p_render_data->scene_data->cam_projection.get_fov(); + real_t aspect = p_render_data->scene_data->cam_projection.get_aspect(); + real_t fovy = p_render_data->scene_data->cam_projection.get_fovy(fov, aspect); + Vector2 jitter = p_render_data->scene_data->taa_jitter * Vector2(rb->get_internal_size()) * 0.5f; + RendererRD::FSR2Effect::Parameters params; + params.context = rb_data->get_fsr2_context(); + params.internal_size = rb->get_internal_size(); + params.sharpness = CLAMP(1.0f - (rb->get_fsr_sharpness() / 2.0f), 0.0f, 1.0f); + params.color = rb->get_internal_texture(v); + params.depth = rb->get_depth_texture(v); + params.velocity = rb->get_velocity_buffer(false, v); + params.reactive = rb->get_internal_texture_reactive(v); + params.exposure = exposure; + params.output = rb->get_upscaled_texture(v); + params.z_near = p_render_data->scene_data->z_near; + params.z_far = p_render_data->scene_data->z_far; + params.fovy = fovy; + params.jitter = jitter; + params.delta_time = float(time_step); + params.reset_accumulation = false; // FIXME: The engine does not provide a way to reset the accumulation. 
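+
+ // Explanatory note: the reprojection matrix assembled below maps current-frame clip space back to the
+ // previous frame's clip space (current clip -> view -> world -> previous view -> previous clip). FSR2
+ // uses it to derive motion from the depth buffer and camera movement for pixels that have no rendered
+ // velocity (see FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS in fsr2.cpp).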
+ + const Projection &prev_proj = p_render_data->scene_data->prev_cam_projection; + const Projection &cur_proj = p_render_data->scene_data->cam_projection; + const Transform3D &prev_transform = p_render_data->scene_data->prev_cam_transform; + const Transform3D &cur_transform = p_render_data->scene_data->cam_transform; + params.reprojection = prev_proj.flipped_y() * prev_transform.affine_inverse() * cur_transform * cur_proj.flipped_y().inverse(); + + fsr2_effect->upscale(params); + } + } else if (using_taa) { + RENDER_TIMESTAMP("TAA"); + taa->process(rb, _render_buffers_get_color_format(), p_render_data->scene_data->z_near, p_render_data->scene_data->z_far); + } } if (rb_data.is_valid()) { @@ -2357,7 +2493,7 @@ void RenderForwardClustered::_render_shadow_append(RID p_framebuffer, const Page render_data.instances = &p_instances; render_data.render_info = p_render_info; - _setup_environment(&render_data, true, Vector2(1, 1), !p_flip_y, Color(), false, p_use_pancake, shadow_pass_index); + _setup_environment(&render_data, true, Vector2(1, 1), !p_flip_y, Color(), false, false, p_use_pancake, shadow_pass_index); if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_DISABLE_LOD) { scene_data.screen_mesh_lod_threshold = 0.0; @@ -2368,7 +2504,7 @@ void RenderForwardClustered::_render_shadow_append(RID p_framebuffer, const Page PassMode pass_mode = p_use_dp ? PASS_MODE_SHADOW_DP : PASS_MODE_SHADOW; uint32_t render_list_from = render_list[RENDER_LIST_SECONDARY].elements.size(); - _fill_render_list(RENDER_LIST_SECONDARY, &render_data, pass_mode, 0, false, false, true); + _fill_render_list(RENDER_LIST_SECONDARY, &render_data, pass_mode, false, false, false, true); uint32_t render_list_size = render_list[RENDER_LIST_SECONDARY].elements.size() - render_list_from; render_list[RENDER_LIST_SECONDARY].sort_by_key_range(render_list_from, render_list_size); _fill_instance_data(RENDER_LIST_SECONDARY, p_render_info ? 
p_render_info->info[RS::VIEWPORT_RENDER_INFO_TYPE_SHADOW] : (int *)nullptr, render_list_from, render_list_size, false); @@ -2453,7 +2589,7 @@ void RenderForwardClustered::_render_particle_collider_heightfield(RID p_fb, con _update_render_base_uniform_set(RendererRD::MaterialStorage::get_singleton()->samplers_rd_get_default()); - _setup_environment(&render_data, true, Vector2(1, 1), true, Color(), false, false); + _setup_environment(&render_data, true, Vector2(1, 1), true, Color(), false, false, false); PassMode pass_mode = PASS_MODE_SHADOW; @@ -3475,6 +3611,10 @@ void RenderForwardClustered::_geometry_instance_add_surface_with_material(Geomet flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_PARTICLE_TRAILS; } + if (p_material->shader_data->is_animated()) { + flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_MOTION_VECTOR; + } + SceneShaderForwardClustered::MaterialData *material_shadow = nullptr; void *surface_shadow = nullptr; if (!p_material->shader_data->uses_particle_trails && !p_material->shader_data->writes_modelview_or_projection && !p_material->shader_data->uses_vertex && !p_material->shader_data->uses_position && !p_material->shader_data->uses_discard && !p_material->shader_data->uses_depth_prepass_alpha && !p_material->shader_data->uses_alpha_clip && !p_material->shader_data->uses_alpha_antialiasing && p_material->shader_data->cull_mode == SceneShaderForwardClustered::ShaderData::CULL_BACK && !p_material->shader_data->uses_point_size) { @@ -3982,6 +4122,7 @@ RenderForwardClustered::RenderForwardClustered() { resolve_effects = memnew(RendererRD::Resolve()); taa = memnew(RendererRD::TAA); + fsr2_effect = memnew(RendererRD::FSR2Effect); ss_effects = memnew(RendererRD::SSEffects); } @@ -3996,6 +4137,11 @@ RenderForwardClustered::~RenderForwardClustered() { taa = nullptr; } + if (fsr2_effect) { + memdelete(fsr2_effect); + fsr2_effect = nullptr; + } + if (resolve_effects != nullptr) { memdelete(resolve_effects); resolve_effects = nullptr; diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h index 39584bae631..6955d4f6ef3 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h @@ -33,6 +33,7 @@ #include "core/templates/paged_allocator.h" #include "servers/rendering/renderer_rd/cluster_builder_rd.h" +#include "servers/rendering/renderer_rd/effects/fsr2.h" #include "servers/rendering/renderer_rd/effects/resolve.h" #include "servers/rendering/renderer_rd/effects/ss_effects.h" #include "servers/rendering/renderer_rd/effects/taa.h" @@ -84,6 +85,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { enum RenderListType { RENDER_LIST_OPAQUE, //used for opaque objects + RENDER_LIST_MOTION, //used for opaque objects with motion RENDER_LIST_ALPHA, //used for transparent objects RENDER_LIST_SECONDARY, //used for shadows and other objects RENDER_LIST_MAX @@ -100,6 +102,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { private: RenderSceneBuffersRD *render_buffers = nullptr; + RendererRD::FSR2Context *fsr2_context = nullptr; public: ClusterBuilderRD *cluster_builder = nullptr; @@ -140,10 +143,14 @@ class RenderForwardClustered : public RendererSceneRenderRD { RID get_voxelgi(uint32_t p_layer) { return render_buffers->get_texture_slice(RB_SCOPE_FORWARD_CLUSTERED, RB_TEX_VOXEL_GI, p_layer, 0); } RID get_voxelgi_msaa(uint32_t p_layer) { return 
render_buffers->get_texture_slice(RB_SCOPE_FORWARD_CLUSTERED, RB_TEX_VOXEL_GI_MSAA, p_layer, 0); } + void ensure_fsr2(RendererRD::FSR2Effect *p_effect); + RendererRD::FSR2Context *get_fsr2_context() const { return fsr2_context; } + RID get_color_only_fb(); RID get_color_pass_fb(uint32_t p_color_pass_flags); RID get_depth_fb(DepthFrameBufferType p_type = DEPTH_FB); RID get_specular_only_fb(); + RID get_velocity_only_fb(); virtual void configure(RenderSceneBuffersRD *p_render_buffers) override; virtual void free_data() override; @@ -345,7 +352,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { static RenderForwardClustered *singleton; - void _setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, bool p_flip_y, const Color &p_default_bg_color, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false, int p_index = 0); + void _setup_environment(const RenderDataRD *p_render_data, bool p_no_fog, const Size2i &p_screen_size, bool p_flip_y, const Color &p_default_bg_color, bool p_opaque_render_buffers = false, bool p_apply_alpha_multiplier = false, bool p_pancake_shadows = false, int p_index = 0); void _setup_voxelgis(const PagedArray &p_voxelgis); void _setup_lightmaps(const RenderDataRD *p_render_data, const PagedArray &p_lightmaps, const Transform3D &p_cam_transform); @@ -372,7 +379,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { void _update_instance_data_buffer(RenderListType p_render_list); void _fill_instance_data(RenderListType p_render_list, int *p_render_info = nullptr, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true); - void _fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, uint32_t p_color_pass_flags, bool p_using_sdfgi = false, bool p_using_opaque_gi = false, bool p_append = false); + void _fill_render_list(RenderListType p_render_list, const RenderDataRD *p_render_data, PassMode p_pass_mode, bool p_using_sdfgi = false, bool p_using_opaque_gi = false, bool p_using_motion_pass = false, bool p_append = false); HashMap sdfgi_framebuffer_size_cache; @@ -397,6 +404,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { FLAG_USES_NORMAL_TEXTURE = 16384, FLAG_USES_DOUBLE_SIDED_SHADOWS = 32768, FLAG_USES_PARTICLE_TRAILS = 65536, + FLAG_USES_MOTION_VECTOR = 131072, }; union { @@ -424,6 +432,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { RS::PrimitiveType primitive = RS::PRIMITIVE_MAX; uint32_t flags = 0; uint32_t surface_index = 0; + uint32_t color_pass_inclusion_mask = 0; void *surface = nullptr; RID material_uniform_set; @@ -563,6 +572,7 @@ class RenderForwardClustered : public RendererSceneRenderRD { RendererRD::Resolve *resolve_effects = nullptr; RendererRD::TAA *taa = nullptr; + RendererRD::FSR2Effect *fsr2_effect = nullptr; RendererRD::SSEffects *ss_effects = nullptr; /* Cluster builder */ diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp index d5f81da29f0..71852b5aedf 100644 --- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp +++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp @@ -823,9 +823,9 @@ void RenderForwardMobile::_render_scene(RenderDataRD *p_render_data, const Color correction.set_depth_correction(true); Projection projection = correction * p_render_data->scene_data->cam_projection; - 
sky.setup_sky(p_render_data->environment, p_render_data->render_buffers, *p_render_data->lights, p_render_data->camera_attributes, 1, &projection, &eye_offset, p_render_data->scene_data->cam_transform, projection, screen_size, this); + sky.setup_sky(p_render_data->environment, p_render_data->render_buffers, *p_render_data->lights, p_render_data->camera_attributes, 1, &projection, &eye_offset, p_render_data->scene_data->cam_transform, projection, screen_size, Vector2(0.0f, 0.0f), this); } else { - sky.setup_sky(p_render_data->environment, p_render_data->render_buffers, *p_render_data->lights, p_render_data->camera_attributes, p_render_data->scene_data->view_count, p_render_data->scene_data->view_projection, p_render_data->scene_data->view_eye_offset, p_render_data->scene_data->cam_transform, p_render_data->scene_data->cam_projection, screen_size, this); + sky.setup_sky(p_render_data->environment, p_render_data->render_buffers, *p_render_data->lights, p_render_data->camera_attributes, p_render_data->scene_data->view_count, p_render_data->scene_data->view_projection, p_render_data->scene_data->view_eye_offset, p_render_data->scene_data->cam_transform, p_render_data->scene_data->cam_projection, screen_size, p_render_data->scene_data->taa_jitter, this); } sky_energy_multiplier *= bg_energy_multiplier; @@ -1908,7 +1908,7 @@ void RenderForwardMobile::_setup_environment(const RenderDataRD *p_render_data, } } - p_render_data->scene_data->update_ubo(scene_state.uniform_buffers[p_index], get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_flip_y, p_pancake_shadows, p_screen_size, p_default_bg_color, _render_buffers_get_luminance_multiplier(), p_opaque_render_buffers); + p_render_data->scene_data->update_ubo(scene_state.uniform_buffers[p_index], get_debug_draw_mode(), env, reflection_probe_instance, p_render_data->camera_attributes, p_flip_y, p_pancake_shadows, p_screen_size, p_default_bg_color, _render_buffers_get_luminance_multiplier(), p_opaque_render_buffers, false); } void RenderForwardMobile::_fill_element_info(RenderListType p_render_list, uint32_t p_offset, int32_t p_max_elements) { diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp index 20e24dba0eb..7696bddbca8 100644 --- a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp +++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp @@ -340,14 +340,16 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende // Glow, auto exposure and DoF (if enabled). - Size2i internal_size = rb->get_internal_size(); Size2i target_size = rb->get_target_size(); - bool can_use_effects = target_size.x >= 8 && target_size.y >= 8; // FIXME I think this should check internal size, we do all our post processing at this size... bool can_use_storage = _render_buffers_can_be_storage(); + bool use_fsr = fsr && can_use_effects && rb->get_scaling_3d_mode() == RS::VIEWPORT_SCALING_3D_MODE_FSR; + bool use_upscaled_texture = rb->has_upscaled_texture() && rb->get_scaling_3d_mode() == RS::VIEWPORT_SCALING_3D_MODE_FSR2; + RID render_target = rb->get_render_target(); - RID internal_texture = rb->get_internal_texture(); + RID color_texture = use_upscaled_texture ? rb->get_upscaled_texture() : rb->get_internal_texture(); + Size2i color_size = use_upscaled_texture ? 
target_size : rb->get_internal_size(); if (can_use_effects && RSG::camera_attributes->camera_attributes_uses_dof(p_render_data->camera_attributes)) { RENDER_TIMESTAMP("Depth of Field"); @@ -358,14 +360,14 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende RendererRD::BokehDOF::BokehBuffers buffers; // Textures we use - buffers.base_texture_size = rb->get_internal_size(); + buffers.base_texture_size = color_size; buffers.secondary_texture = rb->get_texture_slice(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0, 0, 0); buffers.half_texture[0] = rb->get_texture_slice(RB_SCOPE_BUFFERS, RB_TEX_BLUR_1, 0, 0); buffers.half_texture[1] = rb->get_texture_slice(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0, 0, 1); if (can_use_storage) { for (uint32_t i = 0; i < rb->get_view_count(); i++) { - buffers.base_texture = rb->get_internal_texture(i); + buffers.base_texture = use_upscaled_texture ? rb->get_upscaled_texture(i) : rb->get_internal_texture(i); buffers.depth_texture = rb->get_depth_texture(i); // In stereo p_render_data->z_near and p_render_data->z_far can be offset for our combined frustum. @@ -387,7 +389,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende buffers.base_weight_fb = rb->weight_buffers[0].fb; for (uint32_t i = 0; i < rb->get_view_count(); i++) { - buffers.base_texture = rb->get_internal_texture(i); + buffers.base_texture = use_upscaled_texture ? rb->get_upscaled_texture(i) : rb->get_internal_texture(i); buffers.depth_texture = rb->get_depth_texture(i); buffers.base_fb = FramebufferCacheRD::get_singleton()->get_cache(buffers.base_texture); // TODO move this into bokeh_dof_raster, we can do this internally @@ -416,7 +418,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende double step = RSG::camera_attributes->camera_attributes_get_auto_exposure_adjust_speed(p_render_data->camera_attributes) * time_step; float auto_exposure_min_sensitivity = RSG::camera_attributes->camera_attributes_get_auto_exposure_min_sensitivity(p_render_data->camera_attributes); float auto_exposure_max_sensitivity = RSG::camera_attributes->camera_attributes_get_auto_exposure_max_sensitivity(p_render_data->camera_attributes); - luminance->luminance_reduction(internal_texture, internal_size, luminance_buffers, auto_exposure_min_sensitivity, auto_exposure_max_sensitivity, step, set_immediate); + luminance->luminance_reduction(color_texture, color_size, luminance_buffers, auto_exposure_min_sensitivity, auto_exposure_max_sensitivity, step, set_immediate); // Swap final reduce with prev luminance. @@ -525,7 +527,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende } tonemap.use_debanding = rb->get_use_debanding(); - tonemap.texture_size = Vector2i(rb->get_internal_size().x, rb->get_internal_size().y); + tonemap.texture_size = Vector2i(color_size.x, color_size.y); if (p_render_data->environment.is_valid()) { tonemap.tonemap_mode = environment_get_tone_mapper(p_render_data->environment); @@ -555,7 +557,8 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende tonemap.convert_to_srgb = !texture_storage->render_target_is_using_hdr(render_target); RID dest_fb; - if (fsr && can_use_effects && rb->get_scaling_3d_mode() == RS::VIEWPORT_SCALING_3D_MODE_FSR) { + bool use_intermediate_fb = use_fsr; + if (use_intermediate_fb) { // If we use FSR to upscale we need to write our result into an intermediate buffer. // Note that this is cached so we only create the texture the first time. 
RID dest_texture = rb->create_texture(SNAME("Tonemapper"), SNAME("destination"), _render_buffers_get_color_format(), RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT); @@ -567,12 +570,12 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(const Rende dest_fb = texture_storage->render_target_get_rd_framebuffer(render_target); } - tone_mapper->tonemapper(internal_texture, dest_fb, tonemap); + tone_mapper->tonemapper(color_texture, dest_fb, tonemap); RD::get_singleton()->draw_command_end_label(); } - if (fsr && can_use_effects && rb->get_scaling_3d_mode() == RS::VIEWPORT_SCALING_3D_MODE_FSR) { + if (use_fsr) { RD::get_singleton()->draw_command_begin_label("FSR 1.0 Upscale"); for (uint32_t v = 0; v < rb->get_view_count(); v++) { @@ -732,6 +735,11 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(const RenderDataRD *p_ren } } + if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_INTERNAL_BUFFER) { + Size2 rtsize = texture_storage->render_target_get_size(render_target); + copy_effects->copy_to_fb_rect(rb->get_internal_texture(), texture_storage->render_target_get_rd_framebuffer(render_target), Rect2(Vector2(), rtsize), false, false); + } + if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_NORMAL_BUFFER && _render_buffers_get_normal_texture(rb).is_valid()) { Size2 rtsize = texture_storage->render_target_get_size(render_target); copy_effects->copy_to_fb_rect(_render_buffers_get_normal_texture(rb), texture_storage->render_target_get_rd_framebuffer(render_target), Rect2(Vector2(), rtsize), false, false); @@ -745,7 +753,12 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(const RenderDataRD *p_ren } if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_MOTION_VECTORS && _render_buffers_get_velocity_texture(rb).is_valid()) { - debug_effects->draw_motion_vectors(_render_buffers_get_velocity_texture(rb), texture_storage->render_target_get_rd_framebuffer(render_target), rb->get_internal_size()); + RID velocity = _render_buffers_get_velocity_texture(rb); + RID depth = rb->get_depth_texture(); + RID dest_fb = texture_storage->render_target_get_rd_framebuffer(render_target); + Size2i resolution = rb->get_internal_size(); + + debug_effects->draw_motion_vectors(velocity, depth, dest_fb, p_render_data->scene_data->cam_projection, p_render_data->scene_data->cam_transform, p_render_data->scene_data->prev_cam_projection, p_render_data->scene_data->prev_cam_transform, resolution); } } diff --git a/servers/rendering/renderer_rd/shaders/effects/SCsub b/servers/rendering/renderer_rd/shaders/effects/SCsub index f06a2d86e24..810f781340c 100644 --- a/servers/rendering/renderer_rd/shaders/effects/SCsub +++ b/servers/rendering/renderer_rd/shaders/effects/SCsub @@ -15,3 +15,5 @@ if "RD_GLSL" in env["BUILDERS"]: # compile shaders for glsl_file in glsl_files: env.RD_GLSL(glsl_file) + +SConscript("fsr2/SCsub") diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/SCsub b/servers/rendering/renderer_rd/shaders/effects/fsr2/SCsub new file mode 100644 index 00000000000..f06a2d86e24 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/SCsub @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +Import("env") + +if "RD_GLSL" in env["BUILDERS"]: + # find all include files + gl_include_files = [str(f) for f in Glob("*_inc.glsl")] + [str(f) for f in Glob("../*_inc.glsl")] + + # find all shader code(all glsl files excluding our include files) + glsl_files = [str(f) for f in Glob("*.glsl") if str(f) not in gl_include_files] + + # make sure we recompile shaders 
if include files change + env.Depends([f + ".gen.h" for f in glsl_files], gl_include_files + ["#glsl_builders.py"]) + + # compile shaders + for glsl_file in glsl_files: + env.RD_GLSL(glsl_file) diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_accumulate_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_accumulate_pass.glsl new file mode 100644 index 00000000000..67fce9a3429 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_accumulate_pass.glsl @@ -0,0 +1,8 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "../motion_vector_inc.glsl" +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_autogen_reactive_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_autogen_reactive_pass.glsl new file mode 100644 index 00000000000..d362958aa69 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_autogen_reactive_pass.glsl @@ -0,0 +1,8 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "../motion_vector_inc.glsl" +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_compute_luminance_pyramid_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_compute_luminance_pyramid_pass.glsl new file mode 100644 index 00000000000..37504c2e530 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_compute_luminance_pyramid_pass.glsl @@ -0,0 +1,7 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_depth_clip_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_depth_clip_pass.glsl new file mode 100644 index 00000000000..0ee08e4c76f --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_depth_clip_pass.glsl @@ -0,0 +1,8 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "../motion_vector_inc.glsl" +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_lock_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_lock_pass.glsl new file mode 100644 index 00000000000..8c8430d4b1c --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_lock_pass.glsl @@ -0,0 +1,7 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_rcas_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_rcas_pass.glsl new file mode 100644 index 00000000000..4120cfe6449 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_rcas_pass.glsl @@ -0,0 +1,7 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_reconstruct_previous_depth_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_reconstruct_previous_depth_pass.glsl new file mode 100644 index 00000000000..f31abec215d --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_reconstruct_previous_depth_pass.glsl @@ -0,0 +1,8 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "../motion_vector_inc.glsl" +#include 
"thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_tcr_autogen_pass.glsl b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_tcr_autogen_pass.glsl new file mode 100644 index 00000000000..818374e43c9 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/fsr2/fsr2_tcr_autogen_pass.glsl @@ -0,0 +1,8 @@ +#[compute] + +#version 450 + +#VERSION_DEFINES + +#include "../motion_vector_inc.glsl" +#include "thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl" diff --git a/servers/rendering/renderer_rd/shaders/effects/motion_vector_inc.glsl b/servers/rendering/renderer_rd/shaders/effects/motion_vector_inc.glsl new file mode 100644 index 00000000000..cbf202653e7 --- /dev/null +++ b/servers/rendering/renderer_rd/shaders/effects/motion_vector_inc.glsl @@ -0,0 +1,6 @@ +vec2 derive_motion_vector(vec2 uv, float depth, mat4 reprojection_matrix) { + vec4 previous_pos_ndc = reprojection_matrix * vec4(uv * 2.0f - 1.0f, depth * 2.0f - 1.0f, 1.0f); + return 0.5f + (previous_pos_ndc.xy / previous_pos_ndc.w) * 0.5f - uv; +} + +#define FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS_FUNCTION(i, j, k) derive_motion_vector(i, j, k) diff --git a/servers/rendering/renderer_rd/shaders/effects/motion_vectors.glsl b/servers/rendering/renderer_rd/shaders/effects/motion_vectors.glsl index 80e4f51565c..d02ffe0b4f2 100644 --- a/servers/rendering/renderer_rd/shaders/effects/motion_vectors.glsl +++ b/servers/rendering/renderer_rd/shaders/effects/motion_vectors.glsl @@ -18,14 +18,19 @@ void main() { #VERSION_DEFINES +#include "motion_vector_inc.glsl" + layout(location = 0) in vec2 uv_interp; layout(set = 0, binding = 0) uniform sampler2D source_velocity; +layout(set = 0, binding = 1) uniform sampler2D source_depth; layout(location = 0) out vec4 frag_color; layout(push_constant, std430) uniform Params { + highp mat4 reprojection_matrix; vec2 resolution; + bool force_derive_from_depth; } params; @@ -49,7 +54,14 @@ void main() { vec2 pos_pixel = uv_interp * params.resolution; vec2 cell_pos_pixel = floor(pos_pixel / cell_size) * cell_size + (cell_size * 0.5f); vec2 cell_pos_uv = cell_pos_pixel / params.resolution; - vec2 cell_pos_previous_uv = cell_pos_uv + textureLod(source_velocity, cell_pos_uv, 0.0f).xy; + vec2 cell_pos_velocity = textureLod(source_velocity, cell_pos_uv, 0.0f).xy; + bool derive_velocity = params.force_derive_from_depth || all(lessThanEqual(cell_pos_velocity, vec2(-1.0f, -1.0f))); + if (derive_velocity) { + float depth = textureLod(source_depth, cell_pos_uv, 0.0f).x; + cell_pos_velocity = derive_motion_vector(cell_pos_uv, depth, params.reprojection_matrix); + } + + vec2 cell_pos_previous_uv = cell_pos_uv + cell_pos_velocity; // Draw the shapes. 
float epsilon = 1e-6f; @@ -76,5 +88,10 @@ void main() { alpha = 0.0f; } + if (derive_velocity) { + color = vec3(1.0f, 1.0f, 1.0f) - color; + alpha *= 0.5f; + } + frag_color = vec4(color, alpha); } diff --git a/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl b/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl index 2b8b8fa9d25..878b629c25b 100644 --- a/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl +++ b/servers/rendering/renderer_rd/shaders/forward_clustered/scene_forward_clustered.glsl @@ -2280,6 +2280,8 @@ void fragment_shader(in SceneData scene_data) { #else //MODE_SEPARATE_SPECULAR + alpha *= scene_data.pass_alpha_multiplier; + #ifdef MODE_UNSHADED frag_color = vec4(albedo, alpha); #else diff --git a/servers/rendering/renderer_rd/shaders/scene_data_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_data_inc.glsl index b57ee185217..f42fafc68a2 100644 --- a/servers/rendering/renderer_rd/shaders/scene_data_inc.glsl +++ b/servers/rendering/renderer_rd/shaders/scene_data_inc.glsl @@ -64,6 +64,6 @@ struct SceneData { bool pancake_shadows; uint camera_visible_layers; - uint pad2; + float pass_alpha_multiplier; uint pad3; }; diff --git a/servers/rendering/renderer_rd/storage_rd/mesh_storage.cpp b/servers/rendering/renderer_rd/storage_rd/mesh_storage.cpp index 439d0702f52..487c3144720 100644 --- a/servers/rendering/renderer_rd/storage_rd/mesh_storage.cpp +++ b/servers/rendering/renderer_rd/storage_rd/mesh_storage.cpp @@ -1392,12 +1392,18 @@ void MeshStorage::_multimesh_get_motion_vectors_offsets(RID p_multimesh, uint32_ MultiMesh *multimesh = multimesh_owner.get_or_null(p_multimesh); ERR_FAIL_COND(!multimesh); r_current_offset = multimesh->motion_vectors_current_offset; - if (RSG::rasterizer->get_frame_number() - multimesh->motion_vectors_last_change >= 2) { + if (!_multimesh_uses_motion_vectors(multimesh)) { multimesh->motion_vectors_previous_offset = multimesh->motion_vectors_current_offset; } r_prev_offset = multimesh->motion_vectors_previous_offset; } +bool MeshStorage::_multimesh_uses_motion_vectors_offsets(RID p_multimesh) { + MultiMesh *multimesh = multimesh_owner.get_or_null(p_multimesh); + ERR_FAIL_NULL_V(multimesh, false); + return _multimesh_uses_motion_vectors(multimesh); +} + int MeshStorage::multimesh_get_instance_count(RID p_multimesh) const { MultiMesh *multimesh = multimesh_owner.get_or_null(p_multimesh); ERR_FAIL_COND_V(!multimesh, 0); @@ -1500,6 +1506,10 @@ void MeshStorage::_multimesh_update_motion_vectors_data_cache(MultiMesh *multime } } +bool MeshStorage::_multimesh_uses_motion_vectors(MultiMesh *multimesh) { + return (RSG::rasterizer->get_frame_number() - multimesh->motion_vectors_last_change) < 2; +} + void MeshStorage::_multimesh_mark_dirty(MultiMesh *multimesh, int p_index, bool p_aabb) { uint32_t region_index = p_index / MULTIMESH_DIRTY_REGION_SIZE; #ifdef DEBUG_ENABLED diff --git a/servers/rendering/renderer_rd/storage_rd/mesh_storage.h b/servers/rendering/renderer_rd/storage_rd/mesh_storage.h index 99ba69f98a3..ba973b92a76 100644 --- a/servers/rendering/renderer_rd/storage_rd/mesh_storage.h +++ b/servers/rendering/renderer_rd/storage_rd/mesh_storage.h @@ -244,6 +244,7 @@ private: _FORCE_INLINE_ void _multimesh_make_local(MultiMesh *multimesh) const; _FORCE_INLINE_ void _multimesh_enable_motion_vectors(MultiMesh *multimesh); _FORCE_INLINE_ void _multimesh_update_motion_vectors_data_cache(MultiMesh *multimesh); + _FORCE_INLINE_ bool 
_multimesh_uses_motion_vectors(MultiMesh *multimesh); _FORCE_INLINE_ void _multimesh_mark_dirty(MultiMesh *multimesh, int p_index, bool p_aabb); _FORCE_INLINE_ void _multimesh_mark_all_dirty(MultiMesh *multimesh, bool p_data, bool p_aabb); _FORCE_INLINE_ void _multimesh_re_create_aabb(MultiMesh *multimesh, const float *p_data, int p_instances); @@ -622,6 +623,8 @@ public: void _update_dirty_multimeshes(); void _multimesh_get_motion_vectors_offsets(RID p_multimesh, uint32_t &r_current_offset, uint32_t &r_prev_offset); + bool _multimesh_uses_motion_vectors_offsets(RID p_multimesh); + bool _multimesh_uses_motion_vectors(RID p_multimesh); _FORCE_INLINE_ RS::MultimeshTransformFormat multimesh_get_transform_format(RID p_multimesh) const { MultiMesh *multimesh = multimesh_owner.get_or_null(p_multimesh); diff --git a/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.cpp b/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.cpp index 5b59a453dd6..5ff5adc59ad 100644 --- a/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.cpp +++ b/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.cpp @@ -52,6 +52,7 @@ void RenderSceneBuffersRD::_bind_methods() { ClassDB::bind_method(D_METHOD("get_texture", "context", "name"), &RenderSceneBuffersRD::get_texture); ClassDB::bind_method(D_METHOD("get_texture_format", "context", "name"), &RenderSceneBuffersRD::_get_texture_format); ClassDB::bind_method(D_METHOD("get_texture_slice", "context", "name", "layer", "mipmap", "layers", "mipmaps"), &RenderSceneBuffersRD::get_texture_slice); + ClassDB::bind_method(D_METHOD("get_texture_slice_view", "context", "name", "layer", "mipmap", "layers", "mipmaps", "view"), &RenderSceneBuffersRD::_get_texture_slice_view); ClassDB::bind_method(D_METHOD("get_texture_slice_size", "context", "name", "mipmap"), &RenderSceneBuffersRD::get_texture_slice_size); ClassDB::bind_method(D_METHOD("clear_context", "context"), &RenderSceneBuffersRD::clear_context); @@ -95,8 +96,8 @@ void RenderSceneBuffersRD::free_named_texture(NamedTexture &p_named_texture) { void RenderSceneBuffersRD::update_samplers() { float computed_mipmap_bias = texture_mipmap_bias; - if (use_taa) { - // Use negative mipmap LOD bias when TAA is enabled to compensate for loss of sharpness. + if (use_taa || (scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2)) { + // Use negative mipmap LOD bias when TAA or FSR2 is enabled to compensate for loss of sharpness. // This restores sharpness in still images to be roughly at the same level as without TAA, // but moving scenes will still be blurrier. 
computed_mipmap_bias -= 0.5; @@ -388,6 +389,15 @@ Ref RenderSceneBuffersRD::_get_texture_format(const StringName return tf; } +RID RenderSceneBuffersRD::_get_texture_slice_view(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers, const uint32_t p_mipmaps, const Ref p_view) { + RD::TextureView texture_view; + if (p_view.is_valid()) { + texture_view = p_view->base; + } + + return get_texture_slice_view(p_context, p_texture_name, p_layer, p_mipmap, p_layers, p_mipmaps, texture_view); +} + const RD::TextureFormat RenderSceneBuffersRD::get_texture_format(const StringName &p_context, const StringName &p_texture_name) const { NTKey key(p_context, p_texture_name); @@ -397,6 +407,10 @@ const RD::TextureFormat RenderSceneBuffersRD::get_texture_format(const StringNam } RID RenderSceneBuffersRD::get_texture_slice(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers, const uint32_t p_mipmaps) { + return get_texture_slice_view(p_context, p_texture_name, p_layer, p_mipmap, p_layers, p_mipmaps, RD::TextureView()); +} + +RID RenderSceneBuffersRD::get_texture_slice_view(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers, const uint32_t p_mipmaps, RD::TextureView p_view) { NTKey key(p_context, p_texture_name); // check if this is a known texture @@ -413,19 +427,20 @@ RID RenderSceneBuffersRD::get_texture_slice(const StringName &p_context, const S ERR_FAIL_COND_V(p_mipmap + p_mipmaps > named_texture.format.mipmaps, RID()); // asking the whole thing? just return the original - if (p_layer == 0 && p_mipmap == 0 && named_texture.format.array_layers == p_layers && named_texture.format.mipmaps == p_mipmaps) { + RD::TextureView default_view = RD::TextureView(); + if (p_layer == 0 && p_mipmap == 0 && named_texture.format.array_layers == p_layers && named_texture.format.mipmaps == p_mipmaps && p_view == default_view) { return named_texture.texture; } // see if we have this - NTSliceKey slice_key(p_layer, p_layers, p_mipmap, p_mipmaps); + NTSliceKey slice_key(p_layer, p_layers, p_mipmap, p_mipmaps, p_view); if (named_texture.slices.has(slice_key)) { return named_texture.slices[slice_key]; } // create our slice RID &slice = named_texture.slices[slice_key]; - slice = RD::get_singleton()->texture_create_shared_from_slice(RD::TextureView(), named_texture.texture, p_layer, p_mipmap, p_mipmaps, p_layers > 1 ? RD::TEXTURE_SLICE_2D_ARRAY : RD::TEXTURE_SLICE_2D, p_layers); + slice = RD::get_singleton()->texture_create_shared_from_slice(p_view, named_texture.texture, p_layer, p_mipmap, p_mipmaps, p_layers > 1 ? 
RD::TEXTURE_SLICE_2D_ARRAY : RD::TEXTURE_SLICE_2D, p_layers); Array arr; arr.push_back(p_context); @@ -434,7 +449,12 @@ RID RenderSceneBuffersRD::get_texture_slice(const StringName &p_context, const S arr.push_back(itos(p_layers)); arr.push_back(itos(p_mipmap)); arr.push_back(itos(p_mipmaps)); - RD::get_singleton()->set_resource_name(slice, String("RenderBuffer {0}/{1}, layer {2}/{3}, mipmap {4}/{5}").format(arr)); + arr.push_back(itos(p_view.format_override)); + arr.push_back(itos(p_view.swizzle_r)); + arr.push_back(itos(p_view.swizzle_g)); + arr.push_back(itos(p_view.swizzle_b)); + arr.push_back(itos(p_view.swizzle_a)); + RD::get_singleton()->set_resource_name(slice, String("RenderBuffer {0}/{1}, layer {2}/{3}, mipmap {4}/{5}, view {6}/{7}/{8}/{9}/{10}").format(arr)); // and return our slice return slice; @@ -479,7 +499,13 @@ void RenderSceneBuffersRD::allocate_blur_textures() { return; } - uint32_t mipmaps_required = Image::get_image_required_mipmaps(internal_size.x, internal_size.y, Image::FORMAT_RGBAH); + Size2i blur_size = internal_size; + if (scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2) { + // The blur texture should be as big as the target size when using an upscaler. + blur_size = target_size; + } + + uint32_t mipmaps_required = Image::get_image_required_mipmaps(blur_size.x, blur_size.y, Image::FORMAT_RGBAH); uint32_t usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT; if (can_be_storage) { @@ -488,12 +514,12 @@ void RenderSceneBuffersRD::allocate_blur_textures() { usage_bits += RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT; } - create_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, internal_size, view_count, mipmaps_required); - create_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_1, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, Size2i(internal_size.x >> 1, internal_size.y >> 1), view_count, mipmaps_required - 1); + create_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, blur_size, view_count, mipmaps_required); + create_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_1, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, Size2i(blur_size.x >> 1, blur_size.y >> 1), view_count, mipmaps_required - 1); // if !can_be_storage we need a half width version if (!can_be_storage) { - create_texture(RB_SCOPE_BUFFERS, RB_TEX_HALF_BLUR, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, Size2i(internal_size.x >> 1, internal_size.y), 1, mipmaps_required); + create_texture(RB_SCOPE_BUFFERS, RB_TEX_HALF_BLUR, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, Size2i(blur_size.x >> 1, blur_size.y), 1, mipmaps_required); } // TODO redo this: @@ -502,8 +528,8 @@ void RenderSceneBuffersRD::allocate_blur_textures() { RD::TextureFormat tf; tf.format = RD::DATA_FORMAT_R16_SFLOAT; // We could probably use DATA_FORMAT_R8_SNORM if we don't pre-multiply by blur_size but that depends on whether we can remove DEPTH_GAP - tf.width = internal_size.x; - tf.height = internal_size.y; + tf.width = blur_size.x; + tf.height = blur_size.y; tf.texture_type = RD::TEXTURE_TYPE_2D; tf.array_layers = 1; // Our DOF effect handles one eye per turn tf.usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT; @@ -603,6 +629,16 @@ RID RenderSceneBuffersRD::get_depth_texture(const uint32_t p_layer) { } } +// Upscaled texture. 
+ +void RenderSceneBuffersRD::ensure_upscaled() { + if (!has_upscaled_texture()) { + uint32_t usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | (can_be_storage ? RD::TEXTURE_USAGE_STORAGE_BIT : 0) | RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT; + usage_bits |= RD::TEXTURE_USAGE_INPUT_ATTACHMENT_BIT; + create_texture(RB_SCOPE_BUFFERS, RB_TEX_COLOR_UPSCALED, base_data_format, usage_bits, RD::TEXTURE_SAMPLES_1, target_size); + } +} + // Velocity texture. void RenderSceneBuffersRD::ensure_velocity() { diff --git a/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.h b/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.h index ddd64cb41c3..43704119e75 100644 --- a/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.h +++ b/servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.h @@ -31,6 +31,7 @@ #ifndef RENDER_SCENE_BUFFERS_RD_H #define RENDER_SCENE_BUFFERS_RD_H +#include "../effects/fsr2.h" #include "../effects/vrs.h" #include "../framebuffer_cache_rd.h" #include "core/templates/hash_map.h" @@ -47,6 +48,7 @@ #define RB_TEXTURE SNAME("texture") #define RB_TEX_COLOR SNAME("color") #define RB_TEX_COLOR_MSAA SNAME("color_msaa") +#define RB_TEX_COLOR_UPSCALED SNAME("color_upscaled") #define RB_TEX_DEPTH SNAME("depth") #define RB_TEX_DEPTH_MSAA SNAME("depth_msaa") #define RB_TEX_VELOCITY SNAME("velocity") @@ -114,9 +116,10 @@ private: uint32_t layers; uint32_t mipmap; uint32_t mipmaps; + RD::TextureView texture_view; bool operator==(const NTSliceKey &p_val) const { - return (layer == p_val.layer) && (layers == p_val.layers) && (mipmap == p_val.mipmap) && (mipmaps == p_val.mipmaps); + return (layer == p_val.layer) && (layers == p_val.layers) && (mipmap == p_val.mipmap) && (mipmaps == p_val.mipmaps) && (texture_view == p_val.texture_view); } static uint32_t hash(const NTSliceKey &p_val) { @@ -124,15 +127,21 @@ private: h = hash_murmur3_one_32(p_val.layers, h); h = hash_murmur3_one_32(p_val.mipmap, h); h = hash_murmur3_one_32(p_val.mipmaps, h); + h = hash_murmur3_one_32(p_val.texture_view.format_override); + h = hash_murmur3_one_32(p_val.texture_view.swizzle_r, h); + h = hash_murmur3_one_32(p_val.texture_view.swizzle_g, h); + h = hash_murmur3_one_32(p_val.texture_view.swizzle_b, h); + h = hash_murmur3_one_32(p_val.texture_view.swizzle_a, h); return hash_fmix32(h); } NTSliceKey() {} - NTSliceKey(uint32_t p_layer, uint32_t p_layers, uint32_t p_mipmap, uint32_t p_mipmaps) { + NTSliceKey(uint32_t p_layer, uint32_t p_layers, uint32_t p_mipmap, uint32_t p_mipmaps, RD::TextureView p_texture_view) { layer = p_layer; layers = p_layers; mipmap = p_mipmap; mipmaps = p_mipmaps; + texture_view = p_texture_view; } }; @@ -190,6 +199,7 @@ public: RID get_texture(const StringName &p_context, const StringName &p_texture_name) const; const RD::TextureFormat get_texture_format(const StringName &p_context, const StringName &p_texture_name) const; RID get_texture_slice(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers = 1, const uint32_t p_mipmaps = 1); + RID get_texture_slice_view(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers = 1, const uint32_t p_mipmaps = 1, RD::TextureView p_view = RD::TextureView()); Size2i get_texture_slice_size(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_mipmap); void clear_context(const StringName &p_context); @@ -230,6 +240,14 @@ public: 
_FORCE_INLINE_ RID get_internal_texture(const uint32_t p_layer) { return get_texture_slice(RB_SCOPE_BUFFERS, RB_TEX_COLOR, p_layer, 0); } + _FORCE_INLINE_ RID get_internal_texture_reactive(const uint32_t p_layer) { + RD::TextureView alpha_only_view; + alpha_only_view.swizzle_r = RD::TEXTURE_SWIZZLE_A; + alpha_only_view.swizzle_g = RD::TEXTURE_SWIZZLE_A; + alpha_only_view.swizzle_b = RD::TEXTURE_SWIZZLE_A; + alpha_only_view.swizzle_a = RD::TEXTURE_SWIZZLE_A; + return get_texture_slice_view(RB_SCOPE_BUFFERS, RB_TEX_COLOR, p_layer, 0, 1, 1, alpha_only_view); + } _FORCE_INLINE_ RID get_color_msaa() const { return get_texture(RB_SCOPE_BUFFERS, RB_TEX_COLOR_MSAA); } @@ -251,6 +269,19 @@ public: // back buffer (color) RID get_back_buffer_texture() const { return has_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0) ? get_texture(RB_SCOPE_BUFFERS, RB_TEX_BLUR_0) : RID(); } // We (re)use our blur texture here. + // Upscaled. + void ensure_upscaled(); + + _FORCE_INLINE_ bool has_upscaled_texture() const { + return has_texture(RB_SCOPE_BUFFERS, RB_TEX_COLOR_UPSCALED); + } + _FORCE_INLINE_ RID get_upscaled_texture() const { + return get_texture(RB_SCOPE_BUFFERS, RB_TEX_COLOR_UPSCALED); + } + _FORCE_INLINE_ RID get_upscaled_texture(const uint32_t p_layer) { + return get_texture_slice(RB_SCOPE_BUFFERS, RB_TEX_COLOR_UPSCALED, p_layer, 0); + } + // Velocity, currently only used by TAA (Clustered) but we'll be using this in other places soon too. void ensure_velocity(); @@ -271,6 +302,7 @@ private: RID _create_texture_from_format(const StringName &p_context, const StringName &p_texture_name, const Ref &p_texture_format, const Ref &p_view = Ref(), bool p_unique = true); RID _create_texture_view(const StringName &p_context, const StringName &p_texture_name, const StringName p_view_name, const Ref p_view = Ref()); Ref _get_texture_format(const StringName &p_context, const StringName &p_texture_name) const; + RID _get_texture_slice_view(const StringName &p_context, const StringName &p_texture_name, const uint32_t p_layer, const uint32_t p_mipmap, const uint32_t p_layers = 1, const uint32_t p_mipmaps = 1, const Ref p_view = Ref()); // For color and depth as exposed to extensions, we return the buffer that we're rendering into. // Resolving happens after effects etc. are run. 
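Note on the alpha-only slice view introduced above: FSR2 sources its reactive mask from the internal color buffer's alpha channel without an extra copy, using the new get_texture_slice_view() API. A minimal caller sketch (not part of the patch; the render_buffers pointer and include path are assumptions) that mirrors get_internal_texture_reactive():

    #include "servers/rendering/renderer_rd/storage_rd/render_scene_buffers_rd.h"

    // Request an alpha-only view of the internal color texture. FSR2 samples this
    // view as its reactive mask, so transparency written to alpha reduces history
    // accumulation for those pixels.
    static RID get_reactive_mask_view(RenderSceneBuffersRD *render_buffers) {
    	RD::TextureView alpha_only_view;
    	alpha_only_view.swizzle_r = RD::TEXTURE_SWIZZLE_A;
    	alpha_only_view.swizzle_g = RD::TEXTURE_SWIZZLE_A;
    	alpha_only_view.swizzle_b = RD::TEXTURE_SWIZZLE_A;
    	alpha_only_view.swizzle_a = RD::TEXTURE_SWIZZLE_A;

    	// NTSliceKey now includes the TextureView in its hash and equality check,
    	// so identical requests return the cached shared texture RID instead of
    	// creating a new one each frame.
    	return render_buffers->get_texture_slice_view(RB_SCOPE_BUFFERS, RB_TEX_COLOR, 0, 0, 1, 1, alpha_only_view);
    }
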
diff --git a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.cpp b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.cpp index 27c435eeba7..40891f9a631 100644 --- a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.cpp +++ b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.cpp @@ -38,7 +38,7 @@ RID RenderSceneDataRD::create_uniform_buffer() { return RD::get_singleton()->uniform_buffer_create(sizeof(UBODATA)); } -void RenderSceneDataRD::update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p_debug_mode, RID p_env, RID p_reflection_probe_instance, RID p_camera_attributes, bool p_flip_y, bool p_pancake_shadows, const Size2i &p_screen_size, const Color &p_default_bg_color, float p_luminance_multiplier, bool p_opaque_render_buffers) { +void RenderSceneDataRD::update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p_debug_mode, RID p_env, RID p_reflection_probe_instance, RID p_camera_attributes, bool p_flip_y, bool p_pancake_shadows, const Size2i &p_screen_size, const Color &p_default_bg_color, float p_luminance_multiplier, bool p_opaque_render_buffers, bool p_apply_alpha_multiplier) { RendererSceneRenderRD *render_scene_render = RendererSceneRenderRD::get_singleton(); UBODATA ubo_data; @@ -89,6 +89,7 @@ void RenderSceneDataRD::update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p RendererRD::MaterialStorage::store_soft_shadow_kernel(render_scene_render->penumbra_shadow_kernel_get(), ubo.penumbra_shadow_kernel); RendererRD::MaterialStorage::store_soft_shadow_kernel(render_scene_render->soft_shadow_kernel_get(), ubo.soft_shadow_kernel); ubo.camera_visible_layers = camera_visible_layers; + ubo.pass_alpha_multiplier = p_opaque_render_buffers && p_apply_alpha_multiplier ? 0.0f : 1.0f; ubo.viewport_size[0] = p_screen_size.x; ubo.viewport_size[1] = p_screen_size.y; diff --git a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h index 7546998a9b5..f183207b571 100644 --- a/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h +++ b/servers/rendering/renderer_rd/storage_rd/render_scene_data_rd.h @@ -77,7 +77,7 @@ public: float time_step; RID create_uniform_buffer(); - void update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p_debug_mode, RID p_env, RID p_reflection_probe_instance, RID p_camera_attributes, bool p_flip_y, bool p_pancake_shadows, const Size2i &p_screen_size, const Color &p_default_bg_color, float p_luminance_multiplier, bool p_opaque_render_buffers); + void update_ubo(RID p_uniform_buffer, RS::ViewportDebugDraw p_debug_mode, RID p_env, RID p_reflection_probe_instance, RID p_camera_attributes, bool p_flip_y, bool p_pancake_shadows, const Size2i &p_screen_size, const Color &p_default_bg_color, float p_luminance_multiplier, bool p_opaque_render_buffers, bool p_apply_alpha_multiplier); RID get_uniform_buffer(); private: @@ -144,7 +144,7 @@ private: uint32_t pancake_shadows; uint32_t camera_visible_layers; - uint32_t pad2; + float pass_alpha_multiplier; uint32_t pad3; }; diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp index 45bbcf51c4d..cf7355945be 100644 --- a/servers/rendering/renderer_scene_cull.cpp +++ b/servers/rendering/renderer_scene_cull.cpp @@ -37,6 +37,21 @@ #include +/* HALTON SEQUENCE */ + +#ifndef _3D_DISABLED +static float get_halton_value(int p_index, int p_base) { + float f = 1; + float r = 0; + while (p_index > 0) { + f = f / static_cast(p_base); + r = r + f * (p_index % p_base); + 
p_index = p_index / p_base; + } + return r * 2.0f - 1.0f; +} +#endif // _3D_DISABLED + /* CAMERA API */ RID RendererSceneCull::camera_allocate() { @@ -2498,15 +2513,26 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons return animated_material_found; } -void RendererSceneCull::render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, bool p_use_taa, float p_screen_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderInfo *r_render_info) { +void RendererSceneCull::render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, uint32_t p_jitter_phase_count, float p_screen_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderInfo *r_render_info) { #ifndef _3D_DISABLED Camera *camera = camera_owner.get_or_null(p_camera); ERR_FAIL_COND(!camera); Vector2 jitter; - if (p_use_taa) { - jitter = taa_jitter_array[RSG::rasterizer->get_frame_number() % TAA_JITTER_COUNT] / p_viewport_size; + if (p_jitter_phase_count > 0) { + uint32_t current_jitter_count = camera_jitter_array.size(); + if (p_jitter_phase_count != current_jitter_count) { + // Resize the jitter array and fill it with the pre-computed Halton sequence. + camera_jitter_array.resize(p_jitter_phase_count); + + for (uint32_t i = current_jitter_count; i < p_jitter_phase_count; i++) { + camera_jitter_array[i].x = get_halton_value(i, 2); + camera_jitter_array[i].y = get_halton_value(i, 3); + } + } + + jitter = camera_jitter_array[RSG::rasterizer->get_frame_number() % p_jitter_phase_count] / p_viewport_size; } RendererSceneRender::CameraData camera_data; @@ -4113,17 +4139,6 @@ void RendererSceneCull::set_scene_render(RendererSceneRender *p_scene_render) { geometry_instance_pair_mask = scene_render->geometry_instance_get_pair_mask(); } -float get_halton_value(int index, int base) { - float f = 1; - float r = 0; - while (index > 0) { - f = f / static_cast(base); - r = r + f * (index % base); - index = index / base; - } - return r * 2.0f - 1.0f; -}; - RendererSceneCull::RendererSceneCull() { render_pass = 1; singleton = this; @@ -4148,12 +4163,6 @@ RendererSceneCull::RendererSceneCull() { thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances"); thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)WorkerThreadPool::get_singleton()->get_thread_count()); //make sure there is at least one thread per CPU - taa_jitter_array.resize(TAA_JITTER_COUNT); - for (int i = 0; i < TAA_JITTER_COUNT; i++) { - taa_jitter_array[i].x = get_halton_value(i, 2); - taa_jitter_array[i].y = get_halton_value(i, 3); - } - dummy_occlusion_culling = memnew(RendererSceneOcclusionCull); } diff --git a/servers/rendering/renderer_scene_cull.h b/servers/rendering/renderer_scene_cull.h index 29e5ea29fd6..e3e20b85029 100644 --- a/servers/rendering/renderer_scene_cull.h +++ b/servers/rendering/renderer_scene_cull.h @@ -954,8 +954,7 @@ public: uint32_t geometry_instance_pair_mask = 0; // used in traditional forward, unnecessary on clustered - const int TAA_JITTER_COUNT = 16; - LocalVector taa_jitter_array; + LocalVector camera_jitter_array; virtual RID instance_allocate(); virtual void instance_initialize(RID p_rid); @@ -1089,7 +1088,7 @@ public: void _render_scene(const RendererSceneRender::CameraData *p_camera_data, const Ref &p_render_buffers, RID p_environment, RID p_force_camera_attributes, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, 
RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_mesh_lod_threshold, bool p_using_shadows = true, RenderInfo *r_render_info = nullptr); void render_empty_scene(const Ref &p_render_buffers, RID p_scenario, RID p_shadow_atlas); - void render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, bool p_use_taa, float p_screen_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderingMethod::RenderInfo *r_render_info = nullptr); + void render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, uint32_t p_jitter_phase_count, float p_screen_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderingMethod::RenderInfo *r_render_info = nullptr); void update_dirty_instances(); void render_particle_colliders(); diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp index 8dd591a28ec..09737d03a02 100644 --- a/servers/rendering/renderer_viewport.cpp +++ b/servers/rendering/renderer_viewport.cpp @@ -118,22 +118,29 @@ void RendererViewport::_configure_3d_render_buffers(Viewport *p_viewport) { } else { float scaling_3d_scale = p_viewport->scaling_3d_scale; RS::ViewportScaling3DMode scaling_3d_mode = p_viewport->scaling_3d_mode; + bool scaling_3d_is_fsr = (scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR) || (scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2); + bool use_taa = p_viewport->use_taa; - if ((scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR) && (scaling_3d_scale > 1.0)) { + if (scaling_3d_is_fsr && (scaling_3d_scale > 1.0)) { // FSR is not designed for downsampling. // Fall back to bilinear scaling. + WARN_PRINT_ONCE("FSR 3D resolution scaling is not designed for downsampling. Falling back to bilinear 3D resolution scaling."); scaling_3d_mode = RS::VIEWPORT_SCALING_3D_MODE_BILINEAR; } - if ((scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR) && !p_viewport->fsr_enabled) { + bool upscaler_available = p_viewport->fsr_enabled; + if (scaling_3d_is_fsr && !upscaler_available) { // FSR is not actually available. // Fall back to bilinear scaling. - WARN_PRINT_ONCE("FSR 1.0 3D resolution scaling is not available. Falling back to bilinear 3D resolution scaling."); + WARN_PRINT_ONCE("FSR 3D resolution scaling is not available. Falling back to bilinear 3D resolution scaling."); scaling_3d_mode = RS::VIEWPORT_SCALING_3D_MODE_BILINEAR; } - if (scaling_3d_scale == 1.0) { - scaling_3d_mode = RS::VIEWPORT_SCALING_3D_MODE_OFF; + if (use_taa && scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2) { + // FSR2 can't be used with TAA. + // Turn it off and prefer using FSR2. + WARN_PRINT_ONCE("FSR 2 is not compatible with TAA. Disabling TAA internally."); + use_taa = false; } int width; @@ -151,6 +158,7 @@ void RendererViewport::_configure_3d_render_buffers(Viewport *p_viewport) { render_height = height; break; case RS::VIEWPORT_SCALING_3D_MODE_FSR: + case RS::VIEWPORT_SCALING_3D_MODE_FSR2: width = p_viewport->size.width; height = p_viewport->size.height; render_width = MAX(width * scaling_3d_scale, 1.0); // width / (width * scaling) @@ -174,7 +182,17 @@ void RendererViewport::_configure_3d_render_buffers(Viewport *p_viewport) { break; } + uint32_t jitter_phase_count = 0; + if (scaling_3d_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2) { + // Implementation has been copied from ffxFsr2GetJitterPhaseCount. 
+ jitter_phase_count = uint32_t(8.0f * pow(float(width) / render_width, 2.0f)); + } else if (use_taa) { + // Default jitter count for TAA. + jitter_phase_count = 16; + } + p_viewport->internal_size = Size2(render_width, render_height); + p_viewport->jitter_phase_count = jitter_phase_count; // At resolution scales lower than 1.0, use negative texture mipmap bias // to compensate for the loss of sharpness. @@ -190,7 +208,7 @@ void RendererViewport::_configure_3d_render_buffers(Viewport *p_viewport) { rb_config.set_screen_space_aa(p_viewport->screen_space_aa); rb_config.set_fsr_sharpness(p_viewport->fsr_sharpness); rb_config.set_texture_mipmap_bias(texture_mipmap_bias); - rb_config.set_use_taa(p_viewport->use_taa); + rb_config.set_use_taa(use_taa); p_viewport->render_buffers->configure(&rb_config); } @@ -221,7 +239,7 @@ void RendererViewport::_draw_3d(Viewport *p_viewport) { } float screen_mesh_lod_threshold = p_viewport->mesh_lod_threshold / float(p_viewport->size.width); - RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->internal_size, p_viewport->use_taa, screen_mesh_lod_threshold, p_viewport->shadow_atlas, xr_interface, &p_viewport->render_info); + RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->internal_size, p_viewport->jitter_phase_count, screen_mesh_lod_threshold, p_viewport->shadow_atlas, xr_interface, &p_viewport->render_info); RENDER_TIMESTAMP("< Render 3D Scene"); } @@ -825,8 +843,20 @@ void RendererViewport::viewport_set_use_xr(RID p_viewport, bool p_use_xr) { void RendererViewport::viewport_set_scaling_3d_mode(RID p_viewport, RS::ViewportScaling3DMode p_mode) { Viewport *viewport = viewport_owner.get_or_null(p_viewport); ERR_FAIL_COND(!viewport); + ERR_FAIL_COND_EDMSG(p_mode == RS::VIEWPORT_SCALING_3D_MODE_FSR2 && OS::get_singleton()->get_current_rendering_method() != "forward_plus", "FSR2 is only available when using the Forward+ renderer."); + if (viewport->scaling_3d_mode == p_mode) { + return; + } + + bool motion_vectors_before = _viewport_requires_motion_vectors(viewport); viewport->scaling_3d_mode = p_mode; + + bool motion_vectors_after = _viewport_requires_motion_vectors(viewport); + if (motion_vectors_before != motion_vectors_after) { + num_viewports_with_motion_vectors += motion_vectors_after ? 1 : -1; + } + _configure_3d_render_buffers(viewport); } @@ -888,6 +918,10 @@ void RendererViewport::_viewport_set_size(Viewport *p_viewport, int p_width, int } } +bool RendererViewport::_viewport_requires_motion_vectors(Viewport *p_viewport) { + return p_viewport->use_taa || p_viewport->scaling_3d_mode == RenderingServer::VIEWPORT_SCALING_3D_MODE_FSR2; +} + void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) { Viewport *viewport = viewport_owner.get_or_null(p_viewport); ERR_FAIL_COND(!viewport); @@ -1193,8 +1227,15 @@ void RendererViewport::viewport_set_use_taa(RID p_viewport, bool p_use_taa) { if (viewport->use_taa == p_use_taa) { return; } + + bool motion_vectors_before = _viewport_requires_motion_vectors(viewport); viewport->use_taa = p_use_taa; - num_viewports_with_motion_vectors += p_use_taa ? 1 : -1; + + bool motion_vectors_after = _viewport_requires_motion_vectors(viewport); + if (motion_vectors_before != motion_vectors_after) { + num_viewports_with_motion_vectors += motion_vectors_after ? 
1 : -1; + } + _configure_3d_render_buffers(viewport); } @@ -1379,7 +1420,7 @@ bool RendererViewport::free(RID p_rid) { RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_rid); } - if (viewport->use_taa) { + if (_viewport_requires_motion_vectors(viewport)) { num_viewports_with_motion_vectors--; } diff --git a/servers/rendering/renderer_viewport.h b/servers/rendering/renderer_viewport.h index 3bfb1afd51c..44de6d88042 100644 --- a/servers/rendering/renderer_viewport.h +++ b/servers/rendering/renderer_viewport.h @@ -63,6 +63,7 @@ public: float fsr_sharpness = 0.2f; float texture_mipmap_bias = 0.0f; bool fsr_enabled = false; + uint32_t jitter_phase_count = 0; RS::ViewportUpdateMode update_mode = RenderingServer::VIEWPORT_UPDATE_WHEN_VISIBLE; RID render_target; RID render_target_texture; @@ -203,6 +204,7 @@ public: private: Vector _sort_active_viewports(); void _viewport_set_size(Viewport *p_viewport, int p_width, int p_height, uint32_t p_view_count); + bool _viewport_requires_motion_vectors(Viewport *p_viewport); void _configure_3d_render_buffers(Viewport *p_viewport); void _draw_3d(Viewport *p_viewport); void _draw_viewport(Viewport *p_viewport); diff --git a/servers/rendering/rendering_device.h b/servers/rendering/rendering_device.h index 1917276f968..80c5cda6d12 100644 --- a/servers/rendering/rendering_device.h +++ b/servers/rendering/rendering_device.h @@ -518,6 +518,22 @@ public: TextureSwizzle swizzle_b; TextureSwizzle swizzle_a; + bool operator==(const TextureView &p_view) const { + if (format_override != p_view.format_override) { + return false; + } else if (swizzle_r != p_view.swizzle_r) { + return false; + } else if (swizzle_g != p_view.swizzle_g) { + return false; + } else if (swizzle_b != p_view.swizzle_b) { + return false; + } else if (swizzle_a != p_view.swizzle_a) { + return false; + } else { + return true; + } + } + TextureView() { format_override = DATA_FORMAT_MAX; //means, use same as format swizzle_r = TEXTURE_SWIZZLE_R; @@ -1270,6 +1286,8 @@ public: LIMIT_MAX_VIEWPORT_DIMENSIONS_X, LIMIT_MAX_VIEWPORT_DIMENSIONS_Y, LIMIT_SUBGROUP_SIZE, + LIMIT_SUBGROUP_MIN_SIZE, + LIMIT_SUBGROUP_MAX_SIZE, LIMIT_SUBGROUP_IN_SHADERS, // Set flags using SHADER_STAGE_VERTEX_BIT, SHADER_STAGE_FRAGMENT_BIT, etc. 
LIMIT_SUBGROUP_OPERATIONS, LIMIT_VRS_TEXEL_WIDTH, diff --git a/servers/rendering/rendering_method.h b/servers/rendering/rendering_method.h index f705603a1cb..d1c6c1cbf90 100644 --- a/servers/rendering/rendering_method.h +++ b/servers/rendering/rendering_method.h @@ -301,7 +301,7 @@ public: int info[RS::VIEWPORT_RENDER_INFO_TYPE_MAX][RS::VIEWPORT_RENDER_INFO_MAX] = {}; }; - virtual void render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, bool p_use_taa, float p_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderInfo *r_render_info = nullptr) = 0; + virtual void render_camera(const Ref &p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, uint32_t p_jitter_phase_count, float p_mesh_lod_threshold, RID p_shadow_atlas, Ref &p_xr_interface, RenderInfo *r_render_info = nullptr) = 0; virtual void update() = 0; virtual void render_probes() = 0; diff --git a/servers/rendering/storage/render_scene_buffers.cpp b/servers/rendering/storage/render_scene_buffers.cpp index 19496c0116d..96e64928545 100644 --- a/servers/rendering/storage/render_scene_buffers.cpp +++ b/servers/rendering/storage/render_scene_buffers.cpp @@ -49,7 +49,7 @@ void RenderSceneBuffersConfiguration::_bind_methods() { ClassDB::bind_method(D_METHOD("get_scaling_3d_mode"), &RenderSceneBuffersConfiguration::get_scaling_3d_mode); ClassDB::bind_method(D_METHOD("set_scaling_3d_mode", "scaling_3d_mode"), &RenderSceneBuffersConfiguration::set_scaling_3d_mode); - ADD_PROPERTY(PropertyInfo(Variant::INT, "scaling_3d_mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast)"), "set_scaling_3d_mode", "get_scaling_3d_mode"); // TODO VIEWPORT_SCALING_3D_MODE_OFF is possible here too, but we can't specify an enum string for it. + ADD_PROPERTY(PropertyInfo(Variant::INT, "scaling_3d_mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast),FSR 2.2 (Slow)"), "set_scaling_3d_mode", "get_scaling_3d_mode"); // TODO VIEWPORT_SCALING_3D_MODE_OFF is possible here too, but we can't specify an enum string for it. 
ClassDB::bind_method(D_METHOD("get_msaa_3d"), &RenderSceneBuffersConfiguration::get_msaa_3d); ClassDB::bind_method(D_METHOD("set_msaa_3d", "msaa_3d"), &RenderSceneBuffersConfiguration::set_msaa_3d); diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp index 45ba0b3c089..b7c40600cb3 100644 --- a/servers/rendering_server.cpp +++ b/servers/rendering_server.cpp @@ -2252,6 +2252,7 @@ void RenderingServer::_bind_methods() { BIND_ENUM_CONSTANT(VIEWPORT_SCALING_3D_MODE_BILINEAR); BIND_ENUM_CONSTANT(VIEWPORT_SCALING_3D_MODE_FSR); + BIND_ENUM_CONSTANT(VIEWPORT_SCALING_3D_MODE_FSR2); BIND_ENUM_CONSTANT(VIEWPORT_SCALING_3D_MODE_MAX); BIND_ENUM_CONSTANT(VIEWPORT_UPDATE_DISABLED); @@ -2329,6 +2330,7 @@ void RenderingServer::_bind_methods() { BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES); BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_OCCLUDERS); BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_MOTION_VECTORS); + BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_INTERNAL_BUFFER); BIND_ENUM_CONSTANT(VIEWPORT_VRS_DISABLED); BIND_ENUM_CONSTANT(VIEWPORT_VRS_TEXTURE); @@ -2959,7 +2961,7 @@ void RenderingServer::init() { GLOBAL_DEF(PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/amount", PROPERTY_HINT_RANGE, "0.01,4.0,0.01"), 0.25); GLOBAL_DEF(PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/limit", PROPERTY_HINT_RANGE, "0.01,1.0,0.01"), 0.18); - GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/scaling_3d/mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast)"), 0); + GLOBAL_DEF(PropertyInfo(Variant::INT, "rendering/scaling_3d/mode", PROPERTY_HINT_ENUM, "Bilinear (Fastest),FSR 1.0 (Fast),FSR 2.2 (Slow)"), 0); GLOBAL_DEF(PropertyInfo(Variant::FLOAT, "rendering/scaling_3d/scale", PROPERTY_HINT_RANGE, "0.25,2.0,0.01"), 1.0); GLOBAL_DEF(PropertyInfo(Variant::FLOAT, "rendering/scaling_3d/fsr_sharpness", PROPERTY_HINT_RANGE, "0,2,0.1"), 0.2f); GLOBAL_DEF(PropertyInfo(Variant::FLOAT, "rendering/textures/default_filters/texture_mipmap_bias", PROPERTY_HINT_RANGE, "-2,2,0.001"), 0.0f); diff --git a/servers/rendering_server.h b/servers/rendering_server.h index 1528a957ce4..6b2ba562ce9 100644 --- a/servers/rendering_server.h +++ b/servers/rendering_server.h @@ -807,6 +807,7 @@ public: enum ViewportScaling3DMode { VIEWPORT_SCALING_3D_MODE_BILINEAR, VIEWPORT_SCALING_3D_MODE_FSR, + VIEWPORT_SCALING_3D_MODE_FSR2, VIEWPORT_SCALING_3D_MODE_MAX, VIEWPORT_SCALING_3D_MODE_OFF = 255, // for internal use only }; @@ -971,6 +972,7 @@ public: VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES, VIEWPORT_DEBUG_DRAW_OCCLUDERS, VIEWPORT_DEBUG_DRAW_MOTION_VECTORS, + VIEWPORT_DEBUG_DRAW_INTERNAL_BUFFER, }; virtual void viewport_set_debug_draw(RID p_viewport, ViewportDebugDraw p_draw) = 0; diff --git a/thirdparty/README.md b/thirdparty/README.md index 87d9d8e32ca..1eb95a1a7c2 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -17,6 +17,21 @@ Files extracted from upstream source: - `license.txt` +## amd-fsr2 + +- Upstream: https://github.com/GPUOpen-Effects/FidelityFX-FSR2 +- Version: 2.2.1 (1680d1edd5c034f88ebbbb793d8b88f8842cf804, 2023) +- License: MIT + +Files extracted from upstream source: + +- `ffx_*.cpp` and `ffx_*.h` from `src/ffx-fsr2-api` +- `shaders` folder from `src/ffx-fsr2-api` with `ffx_*.hlsl` files excluded +- `LICENSE.txt` + +Apply `patches` to add the new options required by Godot and general compilation fixes. 
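
For reference, a minimal sketch (not part of the patch) of how the vendored library's jitter helpers are driven. It assumes the ffxFsr2GetJitterPhaseCount and ffxFsr2GetJitterOffset declarations from the vendored ffx_fsr2.h; renderer_viewport.cpp mirrors the phase-count formula when it computes jitter_phase_count, and the input sizes below are illustrative only:

    #include "thirdparty/amd-fsr2/ffx_fsr2.h"

    static void example_fsr2_jitter(int32_t render_width, int32_t display_width, int32_t frame_index) {
    	// Phase count is 8 * (display_width / render_width)^2,
    	// e.g. 32 phases at a 50% render scale.
    	const int32_t phase_count = ffxFsr2GetJitterPhaseCount(render_width, display_width);

    	// Sub-pixel jitter offset for this frame's phase; Godot applies the
    	// equivalent offset to the projection matrix before rendering.
    	float jitter_x = 0.0f;
    	float jitter_y = 0.0f;
    	ffxFsr2GetJitterOffset(&jitter_x, &jitter_y, frame_index % phase_count, phase_count);
    }
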
+ + ## angle - Upstream: https://chromium.googlesource.com/angle/angle/ diff --git a/thirdparty/amd-fsr2/LICENSE.txt b/thirdparty/amd-fsr2/LICENSE.txt new file mode 100644 index 00000000000..c066ae10636 --- /dev/null +++ b/thirdparty/amd-fsr2/LICENSE.txt @@ -0,0 +1,21 @@ +FidelityFX Super Resolution 2.2 +================================= +Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/thirdparty/amd-fsr2/ffx_assert.cpp b/thirdparty/amd-fsr2/ffx_assert.cpp new file mode 100644 index 00000000000..8a70ad501a1 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_assert.cpp @@ -0,0 +1,81 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "ffx_assert.h" +#include // for malloc() + +#ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include // required for OutputDebugString() +#include // required for sprintf_s +#endif // #ifndef _WIN32 + +static FfxAssertCallback s_assertCallback; + +// set the printing callback function +void ffxAssertSetPrintingCallback(FfxAssertCallback callback) +{ + s_assertCallback = callback; + return; +} + +// implementation of assert reporting +bool ffxAssertReport(const char* file, int32_t line, const char* condition, const char* message) +{ + if (!file) { + + return true; + } + +#ifdef _WIN32 + // form the final assertion string and output to the TTY. + const size_t bufferSize = static_cast(snprintf(nullptr, 0, "%s(%d): ASSERTION FAILED. %s\n", file, line, message ? message : condition)) + 1; + char* tempBuf = static_cast(malloc(bufferSize)); + if (!tempBuf) { + + return true; + } + + if (!message) { + sprintf_s(tempBuf, bufferSize, "%s(%d): ASSERTION FAILED. %s\n", file, line, condition); + } else { + sprintf_s(tempBuf, bufferSize, "%s(%d): ASSERTION FAILED. %s\n", file, line, message); + } + + if (!s_assertCallback) { + OutputDebugStringA(tempBuf); + } else { + s_assertCallback(tempBuf); + } + + // free the buffer. + free(tempBuf); + +#else + FFX_UNUSED(line); + FFX_UNUSED(condition); + FFX_UNUSED(message); +#endif + + return true; +} diff --git a/thirdparty/amd-fsr2/ffx_assert.h b/thirdparty/amd-fsr2/ffx_assert.h new file mode 100644 index 00000000000..ae32d2a7334 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_assert.h @@ -0,0 +1,132 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "ffx_types.h" +#include "ffx_util.h" + +#ifdef __cplusplus +extern "C" { +#endif // #ifdef __cplusplus + +#ifdef _DEBUG +#ifdef _WIN32 + +#ifdef DISABLE_FFX_DEBUG_BREAK +#define FFX_DEBUG_BREAK \ + { \ + } +#else +/// Macro to force the debugger to break at this point in the code. +#define FFX_DEBUG_BREAK __debugbreak(); +#endif +#else +#define FFX_DEBUG_BREAK \ + { \ + } +#endif +#else +// don't allow debug break in release builds. +#define FFX_DEBUG_BREAK +#endif + +/// A typedef for the callback function for assert printing. +/// +/// This can be used to re-route printing of assert messages from the FFX backend +/// to another destination. 
For example instead of the default behaviour of printing +/// the assert messages to the debugger's TTY the message can be re-routed to a +/// MessageBox in a GUI application. +/// +/// @param [in] message The message generated by the assert. +/// +typedef void (*FfxAssertCallback)(const char* message); + +/// Function to report an assert. +/// +/// @param [in] file The name of the file as a string. +/// @param [in] line The index of the line in the file. +/// @param [in] condition The boolean condition that was tested. +/// @param [in] msg The optional message to print. +/// +/// @returns +/// Always returns true. +/// +FFX_API bool ffxAssertReport(const char* file, int32_t line, const char* condition, const char* msg); + +/// Provides the ability to set a callback for assert messages. +/// +/// @param [in] callback The callback function that will receive assert messages. +/// +FFX_API void ffxAssertSetPrintingCallback(FfxAssertCallback callback); + +#ifdef _DEBUG +/// Standard assert macro. +#define FFX_ASSERT(condition) \ + do \ + { \ + if (!(condition) && ffxAssertReport(__FILE__, __LINE__, #condition, NULL)) \ + FFX_DEBUG_BREAK \ + } while (0) + +/// Assert macro with message. +#define FFX_ASSERT_MESSAGE(condition, msg) \ + do \ + { \ + if (!(condition) && ffxAssertReport(__FILE__, __LINE__, #condition, msg)) \ + FFX_DEBUG_BREAK \ + } while (0) + +/// Assert macro that always fails. +#define FFX_ASSERT_FAIL(message) \ + do \ + { \ + ffxAssertReport(__FILE__, __LINE__, NULL, message); \ + FFX_DEBUG_BREAK \ + } while (0) +#else +// asserts disabled +#define FFX_ASSERT(condition) \ + do \ + { \ + FFX_UNUSED(condition); \ + } while (0) + +#define FFX_ASSERT_MESSAGE(condition, message) \ + do \ + { \ + FFX_UNUSED(condition); \ + FFX_UNUSED(message); \ + } while (0) + +#define FFX_ASSERT_FAIL(message) \ + do \ + { \ + FFX_UNUSED(message); \ + } while (0) +#endif // #if _DEBUG + +/// Simple static assert. +#define FFX_STATIC_ASSERT(condition) static_assert(condition, #condition) + +#ifdef __cplusplus +} +#endif // #ifdef __cplusplus diff --git a/thirdparty/amd-fsr2/ffx_error.h b/thirdparty/amd-fsr2/ffx_error.h new file mode 100644 index 00000000000..7ba7d9c4ea4 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_error.h @@ -0,0 +1,59 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
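+
+// Error handling convention used throughout the FSR2 API: functions return an
+// FfxErrorCode, FFX_OK (0) on success and an FFX_ERROR_* code otherwise.
+// FFX_RETURN_ON_ERROR(x, y) early-returns y when condition x does not hold,
+// and FFX_VALIDATE(x) propagates any non-FFX_OK result of x. A minimal
+// caller-side sketch (the helper name is illustrative only):
+//
+//     FfxErrorCode initUpscaler(FfxFsr2Context* context, const FfxFsr2ContextDescription* desc) {
+//         FFX_RETURN_ON_ERROR(context, FFX_ERROR_INVALID_POINTER);
+//         FFX_VALIDATE(ffxFsr2ContextCreate(context, desc));
+//         return FFX_OK;
+//     }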
+ +#pragma once + +#include "ffx_types.h" + +/// Typedef for error codes returned from functions in the FidelityFX SDK. +typedef int32_t FfxErrorCode; + +static const FfxErrorCode FFX_OK = 0; ///< The operation completed successfully. +static const FfxErrorCode FFX_ERROR_INVALID_POINTER = 0x80000000; ///< The operation failed due to an invalid pointer. +static const FfxErrorCode FFX_ERROR_INVALID_ALIGNMENT = 0x80000001; ///< The operation failed due to an invalid alignment. +static const FfxErrorCode FFX_ERROR_INVALID_SIZE = 0x80000002; ///< The operation failed due to an invalid size. +static const FfxErrorCode FFX_EOF = 0x80000003; ///< The end of the file was encountered. +static const FfxErrorCode FFX_ERROR_INVALID_PATH = 0x80000004; ///< The operation failed because the specified path was invalid. +static const FfxErrorCode FFX_ERROR_EOF = 0x80000005; ///< The operation failed because end of file was reached. +static const FfxErrorCode FFX_ERROR_MALFORMED_DATA = 0x80000006; ///< The operation failed because of some malformed data. +static const FfxErrorCode FFX_ERROR_OUT_OF_MEMORY = 0x80000007; ///< The operation failed because it ran out memory. +static const FfxErrorCode FFX_ERROR_INCOMPLETE_INTERFACE = 0x80000008; ///< The operation failed because the interface was not fully configured. +static const FfxErrorCode FFX_ERROR_INVALID_ENUM = 0x80000009; ///< The operation failed because of an invalid enumeration value. +static const FfxErrorCode FFX_ERROR_INVALID_ARGUMENT = 0x8000000a; ///< The operation failed because an argument was invalid. +static const FfxErrorCode FFX_ERROR_OUT_OF_RANGE = 0x8000000b; ///< The operation failed because a value was out of range. +static const FfxErrorCode FFX_ERROR_NULL_DEVICE = 0x8000000c; ///< The operation failed because a device was null. +static const FfxErrorCode FFX_ERROR_BACKEND_API_ERROR = 0x8000000d; ///< The operation failed because the backend API returned an error code. +static const FfxErrorCode FFX_ERROR_INSUFFICIENT_MEMORY = 0x8000000e; ///< The operation failed because there was not enough memory. + +/// Helper macro to return error code y from a function when a specific condition, x, is not met. +#define FFX_RETURN_ON_ERROR(x, y) \ + if (!(x)) \ + { \ + return (y); \ + } + +/// Helper macro to return error code x from a function when it is not FFX_OK. +#define FFX_VALIDATE(x) \ + { \ + FfxErrorCode ret = x; \ + FFX_RETURN_ON_ERROR(ret == FFX_OK, ret); \ + } + diff --git a/thirdparty/amd-fsr2/ffx_fsr2.cpp b/thirdparty/amd-fsr2/ffx_fsr2.cpp new file mode 100644 index 00000000000..864f7f12942 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_fsr2.cpp @@ -0,0 +1,1373 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <algorithm>    // for max used inside SPD CPU code.
+#include <math.h>       // for fabs, abs, sinf, sqrt, etc.
+#include <string.h>     // for memset
+#include <float.h>      // for FLT_EPSILON
+#include "ffx_fsr2.h"
+#define FFX_CPU
+#include "shaders/ffx_core.h"
+#include "shaders/ffx_fsr1.h"
+#include "shaders/ffx_spd.h"
+#include "shaders/ffx_fsr2_callbacks_hlsl.h"
+
+#include "ffx_fsr2_maximum_bias.h"
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+
+// -- GODOT start --
+#ifndef _countof
+#define _countof(array) (sizeof(array) / sizeof(array[0]))
+#endif
+
+#ifndef _MSC_VER
+#include <wchar.h>
+#define wcscpy_s wcscpy
+#endif
+// -- GODOT end --
+
+// max queued frames for descriptor management
+static const uint32_t FSR2_MAX_QUEUED_FRAMES = 16;
+
+#include "ffx_fsr2_private.h"
+
+// lists to map shader resource bindpoint name to resource identifier
+typedef struct ResourceBinding
+{
+    uint32_t index;
+    wchar_t name[64];
+}ResourceBinding;
+
+static const ResourceBinding srvResourceBindingTable[] =
+{
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR, L"r_input_color_jittered"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY, L"r_input_opaque_only"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS, L"r_input_motion_vectors"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH, L"r_input_depth" },
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE, L"r_input_exposure"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE, L"r_auto_exposure"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK, L"r_reactive_mask"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK, L"r_transparency_and_composition_mask"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, L"r_reconstructed_previous_nearest_depth"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, L"r_dilated_motion_vectors"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS, L"r_previous_dilated_motion_vectors"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, L"r_dilatedDepth"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, L"r_internal_upscaled_color"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, L"r_lock_status"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, L"r_prepared_input_color"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, L"r_luma_history" },
+    {FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT, L"r_rcas_input"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT, L"r_lanczos_lut"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE, L"r_imgMips"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE, L"r_img_mip_shading_change"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5, L"r_img_mip_5"},
+    {FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT, L"r_upsample_maximum_bias_lut"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"r_dilated_reactive_masks"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS, L"r_new_locks"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA, L"r_lock_input_luma"},
+    {FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR, L"r_input_prev_color_pre_alpha"},
+
{FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR, L"r_input_prev_color_post_alpha"}, +}; + +static const ResourceBinding uavResourceBindingTable[] = +{ + {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, L"rw_reconstructed_previous_nearest_depth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, L"rw_dilated_motion_vectors"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, L"rw_dilatedDepth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, L"rw_internal_upscaled_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, L"rw_lock_status"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, L"rw_prepared_input_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, L"rw_luma_history"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT, L"rw_upscaled_output"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE, L"rw_img_mip_shading_change"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5, L"rw_img_mip_5"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"rw_dilated_reactive_masks"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE, L"rw_auto_exposure"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT, L"rw_spd_global_atomic"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS, L"rw_new_locks"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA, L"rw_lock_input_luma"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE, L"rw_output_autoreactive"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION, L"rw_output_autocomposition"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR, L"rw_output_prev_color_pre_alpha"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR, L"rw_output_prev_color_post_alpha"}, +}; + +static const ResourceBinding cbResourceBindingTable[] = +{ + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2, L"cbFSR2"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD, L"cbSPD"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS, L"cbRCAS"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_GENREACTIVE, L"cbGenerateReactive"}, +}; + +// Broad structure of the root signature. 
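+// Bind points are grouped by root signature slot: UAVs first, then SRVs, then
+// the root constant buffers (the shared cbFSR2 constants plus, for some passes,
+// a secondary per-pass constant buffer bound at register 1), matching the enum
+// below and the order used when the pipelines are created.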
+typedef enum Fsr2RootSignatureLayout { + + FSR2_ROOT_SIGNATURE_LAYOUT_UAVS, + FSR2_ROOT_SIGNATURE_LAYOUT_SRVS, + FSR2_ROOT_SIGNATURE_LAYOUT_CONSTANTS, + FSR2_ROOT_SIGNATURE_LAYOUT_CONSTANTS_REGISTER_1, + FSR2_ROOT_SIGNATURE_LAYOUT_PARAMETER_COUNT +} Fsr2RootSignatureLayout; + +typedef struct Fsr2RcasConstants { + + uint32_t rcasConfig[4]; +} FfxRcasConstants; + +typedef struct Fsr2SpdConstants { + + uint32_t mips; + uint32_t numworkGroups; + uint32_t workGroupOffset[2]; + uint32_t renderSize[2]; +} Fsr2SpdConstants; + +typedef struct Fsr2GenerateReactiveConstants +{ + float scale; + float threshold; + float binaryValue; + uint32_t flags; + +} Fsr2GenerateReactiveConstants; + +typedef struct Fsr2GenerateReactiveConstants2 +{ + float autoTcThreshold; + float autoTcScale; + float autoReactiveScale; + float autoReactiveMax; + +} Fsr2GenerateReactiveConstants2; + +typedef union Fsr2SecondaryUnion { + + Fsr2RcasConstants rcas; + Fsr2SpdConstants spd; + Fsr2GenerateReactiveConstants2 autogenReactive; +} Fsr2SecondaryUnion; + +typedef struct Fsr2ResourceDescription { + + uint32_t id; + const wchar_t* name; + FfxResourceUsage usage; + FfxSurfaceFormat format; + uint32_t width; + uint32_t height; + uint32_t mipCount; + FfxResourceFlags flags; + uint32_t initDataSize; + void* initData; +} Fsr2ResourceDescription; + +FfxConstantBuffer globalFsr2ConstantBuffers[4] = { + { sizeof(Fsr2Constants) / sizeof(uint32_t) }, + { sizeof(Fsr2SpdConstants) / sizeof(uint32_t) }, + { sizeof(Fsr2RcasConstants) / sizeof(uint32_t) }, + { sizeof(Fsr2GenerateReactiveConstants) / sizeof(uint32_t) } +}; + +// Lanczos +static float lanczos2(float value) +{ + return abs(value) < FFX_EPSILON ? 1.f : (sinf(FFX_PI * value) / (FFX_PI * value)) * (sinf(0.5f * FFX_PI * value) / (0.5f * FFX_PI * value)); +} + +// Calculate halton number for index and base. 
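+// The Halton sequence supplies the low-discrepancy values used for the
+// sub-pixel jitter offsets; e.g. base 2 yields 1/2, 1/4, 3/4, 1/8, ... for
+// indices 1, 2, 3, 4.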
+static float halton(int32_t index, int32_t base) +{ + float f = 1.0f, result = 0.0f; + + for (int32_t currentIndex = index; currentIndex > 0;) { + + f /= (float)base; + result = result + f * (float)(currentIndex % base); + currentIndex = (uint32_t)(floorf((float)(currentIndex) / (float)(base))); + } + + return result; +} + +static void fsr2DebugCheckDispatch(FfxFsr2Context_Private* context, const FfxFsr2DispatchDescription* params) +{ + if (params->commandList == nullptr) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"commandList is null"); + } + + if (params->color.resource == nullptr) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"color resource is null"); + } + + if (params->depth.resource == nullptr) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"depth resource is null"); + } + + if (params->motionVectors.resource == nullptr) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"motionVectors resource is null"); + } + + if (params->exposure.resource != nullptr) + { + if ((context->contextDescription.flags & FFX_FSR2_ENABLE_AUTO_EXPOSURE) == FFX_FSR2_ENABLE_AUTO_EXPOSURE) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"exposure resource provided, however auto exposure flag is present"); + } + } + + if (params->output.resource == nullptr) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"output resource is null"); + } + + if (fabs(params->jitterOffset.x) > 1.0f || fabs(params->jitterOffset.y) > 1.0f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"jitterOffset contains value outside of expected range [-1.0, 1.0]"); + } + + if ((params->motionVectorScale.x > (float)context->contextDescription.maxRenderSize.width) || + (params->motionVectorScale.y > (float)context->contextDescription.maxRenderSize.height)) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"motionVectorScale contains scale value greater than maxRenderSize"); + } + if ((params->motionVectorScale.x == 0.0f) || + (params->motionVectorScale.y == 0.0f)) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"motionVectorScale contains zero scale value"); + } + + if ((params->renderSize.width > context->contextDescription.maxRenderSize.width) || + (params->renderSize.height > context->contextDescription.maxRenderSize.height)) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"renderSize is greater than context maxRenderSize"); + } + if ((params->renderSize.width == 0) || + (params->renderSize.height == 0)) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"renderSize contains zero dimension"); + } + + if (params->sharpness < 0.0f || params->sharpness > 1.0f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"sharpness contains value outside of expected range [0.0, 1.0]"); + } + + if (params->frameTimeDelta < 1.0f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, L"frameTimeDelta is less than 1.0f - this value should be milliseconds (~16.6f for 60fps)"); + } + + if (params->preExposure == 0.0f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"preExposure provided as 0.0f which is invalid"); + } + + bool infiniteDepth = (context->contextDescription.flags & FFX_FSR2_ENABLE_DEPTH_INFINITE) == FFX_FSR2_ENABLE_DEPTH_INFINITE; + bool inverseDepth = 
(context->contextDescription.flags & FFX_FSR2_ENABLE_DEPTH_INVERTED) == FFX_FSR2_ENABLE_DEPTH_INVERTED; + + if (inverseDepth) + { + if (params->cameraNear < params->cameraFar) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"FFX_FSR2_ENABLE_DEPTH_INVERTED flag is present yet cameraNear is less than cameraFar"); + } + if (infiniteDepth) + { + if (params->cameraNear != FLT_MAX) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"FFX_FSR2_ENABLE_DEPTH_INFINITE and FFX_FSR2_ENABLE_DEPTH_INVERTED present, yet cameraNear != FLT_MAX"); + } + } + if (params->cameraFar < 0.075f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"FFX_FSR2_ENABLE_DEPTH_INFINITE and FFX_FSR2_ENABLE_DEPTH_INVERTED present, cameraFar value is very low which may result in depth separation artefacting"); + } + } + else + { + if (params->cameraNear > params->cameraFar) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"cameraNear is greater than cameraFar in non-inverted-depth context"); + } + if (infiniteDepth) + { + if (params->cameraFar != FLT_MAX) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"FFX_FSR2_ENABLE_DEPTH_INFINITE and FFX_FSR2_ENABLE_DEPTH_INVERTED present, yet cameraFar != FLT_MAX"); + } + } + if (params->cameraNear < 0.075f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_WARNING, + L"FFX_FSR2_ENABLE_DEPTH_INFINITE and FFX_FSR2_ENABLE_DEPTH_INVERTED present, cameraNear value is very low which may result in depth separation artefacting"); + } + } + + if (params->cameraFovAngleVertical <= 0.0f) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"cameraFovAngleVertical is 0.0f - this value should be > 0.0f"); + } + if (params->cameraFovAngleVertical > FFX_PI) + { + context->contextDescription.fpMessage(FFX_FSR2_MESSAGE_TYPE_ERROR, L"cameraFovAngleVertical is greater than 180 degrees/PI"); + } +} + +static FfxErrorCode patchResourceBindings(FfxPipelineState* inoutPipeline) +{ + for (uint32_t srvIndex = 0; srvIndex < inoutPipeline->srvCount; ++srvIndex) + { + int32_t mapIndex = 0; + for (mapIndex = 0; mapIndex < _countof(srvResourceBindingTable); ++mapIndex) + { + if (0 == wcscmp(srvResourceBindingTable[mapIndex].name, inoutPipeline->srvResourceBindings[srvIndex].name)) + break; + } + if (mapIndex == _countof(srvResourceBindingTable)) + return FFX_ERROR_INVALID_ARGUMENT; + + inoutPipeline->srvResourceBindings[srvIndex].resourceIdentifier = srvResourceBindingTable[mapIndex].index; + } + + for (uint32_t uavIndex = 0; uavIndex < inoutPipeline->uavCount; ++uavIndex) + { + int32_t mapIndex = 0; + for (mapIndex = 0; mapIndex < _countof(uavResourceBindingTable); ++mapIndex) + { + if (0 == wcscmp(uavResourceBindingTable[mapIndex].name, inoutPipeline->uavResourceBindings[uavIndex].name)) + break; + } + if (mapIndex == _countof(uavResourceBindingTable)) + return FFX_ERROR_INVALID_ARGUMENT; + + inoutPipeline->uavResourceBindings[uavIndex].resourceIdentifier = uavResourceBindingTable[mapIndex].index; + } + + for (uint32_t cbIndex = 0; cbIndex < inoutPipeline->constCount; ++cbIndex) + { + int32_t mapIndex = 0; + for (mapIndex = 0; mapIndex < _countof(cbResourceBindingTable); ++mapIndex) + { + if (0 == wcscmp(cbResourceBindingTable[mapIndex].name, inoutPipeline->cbResourceBindings[cbIndex].name)) + break; + } + if (mapIndex == _countof(cbResourceBindingTable)) + return FFX_ERROR_INVALID_ARGUMENT; + + 
inoutPipeline->cbResourceBindings[cbIndex].resourceIdentifier = cbResourceBindingTable[mapIndex].index; + } + + return FFX_OK; +} + + +static FfxErrorCode createPipelineStates(FfxFsr2Context_Private* context) +{ + FFX_ASSERT(context); + + const size_t samplerCount = 2; + FfxFilterType samplers[samplerCount]; + samplers[0] = FFX_FILTER_TYPE_POINT; + samplers[1] = FFX_FILTER_TYPE_LINEAR; + + const size_t rootConstantCount = 2; + uint32_t rootConstants[rootConstantCount]; + rootConstants[0] = sizeof(Fsr2Constants) / sizeof(uint32_t); + rootConstants[1] = sizeof(Fsr2SecondaryUnion) / sizeof(uint32_t); + + FfxPipelineDescription pipelineDescription; + pipelineDescription.contextFlags = context->contextDescription.flags; + pipelineDescription.samplerCount = samplerCount; + pipelineDescription.samplers = samplers; + pipelineDescription.rootConstantBufferCount = rootConstantCount; + pipelineDescription.rootConstantBufferSizes = rootConstants; + + // New interface: will handle RootSignature in backend + // set up pipeline descriptor (basically RootSignature and binding) + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID, &pipelineDescription, &context->pipelineComputeLuminancePyramid)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_RCAS, &pipelineDescription, &context->pipelineRCAS)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_GENERATE_REACTIVE, &pipelineDescription, &context->pipelineGenerateReactive)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_TCR_AUTOGENERATE, &pipelineDescription, &context->pipelineTcrAutogenerate)); + + pipelineDescription.rootConstantBufferCount = 1; + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_DEPTH_CLIP, &pipelineDescription, &context->pipelineDepthClip)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH, &pipelineDescription, &context->pipelineReconstructPreviousDepth)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_LOCK, &pipelineDescription, &context->pipelineLock)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_ACCUMULATE, &pipelineDescription, &context->pipelineAccumulate)); + FFX_VALIDATE(context->contextDescription.callbacks.fpCreatePipeline(&context->contextDescription.callbacks, FFX_FSR2_PASS_ACCUMULATE_SHARPEN, &pipelineDescription, &context->pipelineAccumulateSharpen)); + + // for each pipeline: re-route/fix-up IDs based on names + patchResourceBindings(&context->pipelineDepthClip); + patchResourceBindings(&context->pipelineReconstructPreviousDepth); + patchResourceBindings(&context->pipelineLock); + patchResourceBindings(&context->pipelineAccumulate); + patchResourceBindings(&context->pipelineComputeLuminancePyramid); + patchResourceBindings(&context->pipelineAccumulateSharpen); + patchResourceBindings(&context->pipelineRCAS); + patchResourceBindings(&context->pipelineGenerateReactive); + patchResourceBindings(&context->pipelineTcrAutogenerate); + + return FFX_OK; +} + +static FfxErrorCode 
generateReactiveMaskInternal(FfxFsr2Context_Private* contextPrivate, const FfxFsr2DispatchDescription* params); + +static FfxErrorCode fsr2Create(FfxFsr2Context_Private* context, const FfxFsr2ContextDescription* contextDescription) +{ + FFX_ASSERT(context); + FFX_ASSERT(contextDescription); + + // Setup the data for implementation. + memset(context, 0, sizeof(FfxFsr2Context_Private)); + context->device = contextDescription->device; + + memcpy(&context->contextDescription, contextDescription, sizeof(FfxFsr2ContextDescription)); + + if ((context->contextDescription.flags & FFX_FSR2_ENABLE_DEBUG_CHECKING) == FFX_FSR2_ENABLE_DEBUG_CHECKING) + { + if (context->contextDescription.fpMessage == nullptr) + { + FFX_ASSERT(context->contextDescription.fpMessage != nullptr); + // remove the debug checking flag - we have no message function + context->contextDescription.flags &= ~FFX_FSR2_ENABLE_DEBUG_CHECKING; + } + } + + // Create the device. + FfxErrorCode errorCode = context->contextDescription.callbacks.fpCreateBackendContext(&context->contextDescription.callbacks, context->device); + FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); + + // call out for device caps. + errorCode = context->contextDescription.callbacks.fpGetDeviceCapabilities(&context->contextDescription.callbacks, &context->deviceCapabilities, context->device); + FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); + + // set defaults + context->firstExecution = true; + context->resourceFrameIndex = 0; + + context->constants.displaySize[0] = contextDescription->displaySize.width; + context->constants.displaySize[1] = contextDescription->displaySize.height; + + // generate the data for the LUT. + const uint32_t lanczos2LutWidth = 128; + int16_t lanczos2Weights[lanczos2LutWidth] = { }; + + for (uint32_t currentLanczosWidthIndex = 0; currentLanczosWidthIndex < lanczos2LutWidth; currentLanczosWidthIndex++) { + + const float x = 2.0f * currentLanczosWidthIndex / float(lanczos2LutWidth - 1); + const float y = lanczos2(x); + lanczos2Weights[currentLanczosWidthIndex] = int16_t(roundf(y * 32767.0f)); + } + + // upload path only supports R16_SNORM, let's go and convert + int16_t maximumBias[FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH * FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT]; + for (uint32_t i = 0; i < FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH * FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT; ++i) { + + maximumBias[i] = int16_t(roundf(ffxFsr2MaximumBias[i] / 2.0f * 32767.0f)); + } + + uint8_t defaultReactiveMaskData = 0U; + uint32_t atomicInitData = 0U; + float defaultExposure[] = { 0.0f, 0.0f }; + const FfxResourceType texture1dResourceType = (context->contextDescription.flags & FFX_FSR2_ENABLE_TEXTURE1D_USAGE) ? 
FFX_RESOURCE_TYPE_TEXTURE1D : FFX_RESOURCE_TYPE_TEXTURE2D; + + // declare internal resources needed + const Fsr2ResourceDescription internalSurfaceDesc[] = { + + { FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, L"FSR2_PreparedInputColor", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, L"FSR2_ReconstructedPrevNearestDepth", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R32_UINT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_1, L"FSR2_InternalDilatedVelocity1", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_2, L"FSR2_InternalDilatedVelocity2", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, L"FSR2_DilatedDepth", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R32_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1, L"FSR2_LockStatus1", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16_FLOAT, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2, L"FSR2_LockStatus2", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16_FLOAT, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA, L"FSR2_LockInputLuma", (FfxResourceUsage)(FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS, L"FSR2_NewLocks", (FfxResourceUsage)(FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R8_UNORM, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_1, L"FSR2_InternalUpscaled1", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_2, L"FSR2_InternalUpscaled2", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE, L"FSR2_ExposureMips", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R16_FLOAT, 
contextDescription->maxRenderSize.width / 2, contextDescription->maxRenderSize.height / 2, 0, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_1, L"FSR2_LumaHistory1", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R8G8B8A8_UNORM, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_2, L"FSR2_LumaHistory2", (FfxResourceUsage)(FFX_RESOURCE_USAGE_RENDERTARGET | FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R8G8B8A8_UNORM, contextDescription->displaySize.width, contextDescription->displaySize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT, L"FSR2_SpdAtomicCounter", (FfxResourceUsage)(FFX_RESOURCE_USAGE_UAV), + FFX_SURFACE_FORMAT_R32_UINT, 1, 1, 1, FFX_RESOURCE_FLAGS_ALIASABLE, sizeof(atomicInitData), &atomicInitData }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"FSR2_DilatedReactiveMasks", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R8G8_UNORM, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT, L"FSR2_LanczosLutData", FFX_RESOURCE_USAGE_READ_ONLY, + FFX_SURFACE_FORMAT_R16_SNORM, lanczos2LutWidth, 1, 1, FFX_RESOURCE_FLAGS_NONE, sizeof(lanczos2Weights), lanczos2Weights }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY, L"FSR2_DefaultReactiviyMask", FFX_RESOURCE_USAGE_READ_ONLY, + FFX_SURFACE_FORMAT_R8_UNORM, 1, 1, 1, FFX_RESOURCE_FLAGS_NONE, sizeof(defaultReactiveMaskData), &defaultReactiveMaskData }, + + { FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT, L"FSR2_MaximumUpsampleBias", FFX_RESOURCE_USAGE_READ_ONLY, + FFX_SURFACE_FORMAT_R16_SNORM, FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH, FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT, 1, FFX_RESOURCE_FLAGS_NONE, sizeof(maximumBias), maximumBias }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_EXPOSURE, L"FSR2_DefaultExposure", FFX_RESOURCE_USAGE_READ_ONLY, + FFX_SURFACE_FORMAT_R32G32_FLOAT, 1, 1, 1, FFX_RESOURCE_FLAGS_NONE, sizeof(defaultExposure), defaultExposure }, + + { FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE, L"FSR2_AutoExposure", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R32G32_FLOAT, 1, 1, 1, FFX_RESOURCE_FLAGS_NONE }, + + + // only one for now, will need pingpont to respect the motion vectors + { FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE, L"FSR2_AutoReactive", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R8_UNORM, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION, L"FSR2_AutoComposition", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R8_UNORM, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_1, L"FSR2_PrevPreAlpha0", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R11G11B10_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_1, L"FSR2_PrevPostAlpha0", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R11G11B10_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_2, L"FSR2_PrevPreAlpha1", 
FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R11G11B10_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_2, L"FSR2_PrevPostAlpha1", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R11G11B10_FLOAT, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_NONE }, + + }; + + // clear the SRV resources to NULL. + memset(context->srvResources, 0, sizeof(context->srvResources)); + + for (int32_t currentSurfaceIndex = 0; currentSurfaceIndex < FFX_ARRAY_ELEMENTS(internalSurfaceDesc); ++currentSurfaceIndex) { + + const Fsr2ResourceDescription* currentSurfaceDescription = &internalSurfaceDesc[currentSurfaceIndex]; + const FfxResourceType resourceType = currentSurfaceDescription->height > 1 ? FFX_RESOURCE_TYPE_TEXTURE2D : texture1dResourceType; + const FfxResourceDescription resourceDescription = { resourceType, currentSurfaceDescription->format, currentSurfaceDescription->width, currentSurfaceDescription->height, 1, currentSurfaceDescription->mipCount }; + const FfxResourceStates initialState = (currentSurfaceDescription->usage == FFX_RESOURCE_USAGE_READ_ONLY) ? FFX_RESOURCE_STATE_COMPUTE_READ : FFX_RESOURCE_STATE_UNORDERED_ACCESS; + const FfxCreateResourceDescription createResourceDescription = { FFX_HEAP_TYPE_DEFAULT, resourceDescription, initialState, currentSurfaceDescription->initDataSize, currentSurfaceDescription->initData, currentSurfaceDescription->name, currentSurfaceDescription->usage, currentSurfaceDescription->id }; + + FFX_VALIDATE(context->contextDescription.callbacks.fpCreateResource(&context->contextDescription.callbacks, &createResourceDescription, &context->srvResources[currentSurfaceDescription->id])); + } + + // copy resources to uavResrouces list + memcpy(context->uavResources, context->srvResources, sizeof(context->srvResources)); + + // avoid compiling pipelines on first render + { + context->refreshPipelineStates = false; + errorCode = createPipelineStates(context); + FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); + } + return FFX_OK; +} + +static void fsr2SafeReleasePipeline(FfxFsr2Context_Private* context, FfxPipelineState* pipeline) +{ + FFX_ASSERT(pipeline); + + context->contextDescription.callbacks.fpDestroyPipeline(&context->contextDescription.callbacks, pipeline); +} + +static void fsr2SafeReleaseResource(FfxFsr2Context_Private* context, FfxResourceInternal resource) +{ + context->contextDescription.callbacks.fpDestroyResource(&context->contextDescription.callbacks, resource); +} + +static void fsr2SafeReleaseDevice(FfxFsr2Context_Private* context, FfxDevice* device) +{ + if (*device == nullptr) { + return; + } + + context->contextDescription.callbacks.fpDestroyBackendContext(&context->contextDescription.callbacks); + *device = nullptr; +} + +static FfxErrorCode fsr2Release(FfxFsr2Context_Private* context) +{ + FFX_ASSERT(context); + + fsr2SafeReleasePipeline(context, &context->pipelineDepthClip); + fsr2SafeReleasePipeline(context, &context->pipelineReconstructPreviousDepth); + fsr2SafeReleasePipeline(context, &context->pipelineLock); + fsr2SafeReleasePipeline(context, &context->pipelineAccumulate); + fsr2SafeReleasePipeline(context, &context->pipelineAccumulateSharpen); + fsr2SafeReleasePipeline(context, &context->pipelineRCAS); + fsr2SafeReleasePipeline(context, &context->pipelineComputeLuminancePyramid); + fsr2SafeReleasePipeline(context, &context->pipelineGenerateReactive); + 
fsr2SafeReleasePipeline(context, &context->pipelineTcrAutogenerate); + + // unregister resources not created internally + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT] = { FFX_FSR2_RESOURCE_IDENTIFIER_NULL }; + + // release internal resources + for (int32_t currentResourceIndex = 0; currentResourceIndex < FFX_FSR2_RESOURCE_IDENTIFIER_COUNT; ++currentResourceIndex) { + + fsr2SafeReleaseResource(context, context->srvResources[currentResourceIndex]); + } + + fsr2SafeReleaseDevice(context, &context->device); + + return FFX_OK; +} + +static void setupDeviceDepthToViewSpaceDepthParams(FfxFsr2Context_Private* context, const FfxFsr2DispatchDescription* params) +{ + const bool bInverted = (context->contextDescription.flags & FFX_FSR2_ENABLE_DEPTH_INVERTED) == FFX_FSR2_ENABLE_DEPTH_INVERTED; + const bool bInfinite = (context->contextDescription.flags & FFX_FSR2_ENABLE_DEPTH_INFINITE) == FFX_FSR2_ENABLE_DEPTH_INFINITE; + + // make sure it has no impact if near and far plane values are swapped in dispatch params + // the flags "inverted" and "infinite" will decide what transform to use + float fMin = FFX_MINIMUM(params->cameraNear, params->cameraFar); + float fMax = FFX_MAXIMUM(params->cameraNear, params->cameraFar); + + if (bInverted) { + float tmp = fMin; + fMin = fMax; + fMax = tmp; + } + + // a 0 0 0 x + // 0 b 0 0 y + // 0 0 c d z + // 0 0 e 0 1 + + const float fQ = fMax / (fMin - fMax); + const float d = -1.0f; // for clarity + + const float matrix_elem_c[2][2] = { + fQ, // non reversed, non infinite + -1.0f - FLT_EPSILON, // non reversed, infinite + fQ, // reversed, non infinite + 0.0f + FLT_EPSILON // reversed, infinite + }; + + const float matrix_elem_e[2][2] = { + fQ * fMin, // non reversed, non infinite + -fMin - FLT_EPSILON, // non reversed, infinite + fQ * fMin, // reversed, non infinite + fMax, // reversed, infinite + }; + + context->constants.deviceToViewDepth[0] = d * matrix_elem_c[bInverted][bInfinite]; + context->constants.deviceToViewDepth[1] = matrix_elem_e[bInverted][bInfinite]; + + // revert x and y coords + const float aspect = params->renderSize.width / float(params->renderSize.height); + const float cotHalfFovY = cosf(0.5f * params->cameraFovAngleVertical) / sinf(0.5f * params->cameraFovAngleVertical); + const float a = cotHalfFovY / aspect; + const float b = cotHalfFovY; + + context->constants.deviceToViewDepth[2] = (1.0f / a); + 
context->constants.deviceToViewDepth[3] = (1.0f / b); +} + +static void scheduleDispatch(FfxFsr2Context_Private* context, const FfxFsr2DispatchDescription* params, const FfxPipelineState* pipeline, uint32_t dispatchX, uint32_t dispatchY) +{ + FfxComputeJobDescription jobDescriptor = {}; + + for (uint32_t currentShaderResourceViewIndex = 0; currentShaderResourceViewIndex < pipeline->srvCount; ++currentShaderResourceViewIndex) { + + const uint32_t currentResourceId = pipeline->srvResourceBindings[currentShaderResourceViewIndex].resourceIdentifier; + const FfxResourceInternal currentResource = context->srvResources[currentResourceId]; + jobDescriptor.srvs[currentShaderResourceViewIndex] = currentResource; + wcscpy_s(jobDescriptor.srvNames[currentShaderResourceViewIndex], pipeline->srvResourceBindings[currentShaderResourceViewIndex].name); + } + + for (uint32_t currentUnorderedAccessViewIndex = 0; currentUnorderedAccessViewIndex < pipeline->uavCount; ++currentUnorderedAccessViewIndex) { + + const uint32_t currentResourceId = pipeline->uavResourceBindings[currentUnorderedAccessViewIndex].resourceIdentifier; + wcscpy_s(jobDescriptor.uavNames[currentUnorderedAccessViewIndex], pipeline->uavResourceBindings[currentUnorderedAccessViewIndex].name); + + if (currentResourceId >= FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0 && currentResourceId <= FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_12) + { + const FfxResourceInternal currentResource = context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE]; + jobDescriptor.uavs[currentUnorderedAccessViewIndex] = currentResource; + jobDescriptor.uavMip[currentUnorderedAccessViewIndex] = currentResourceId - FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0; + } + else + { + const FfxResourceInternal currentResource = context->uavResources[currentResourceId]; + jobDescriptor.uavs[currentUnorderedAccessViewIndex] = currentResource; + jobDescriptor.uavMip[currentUnorderedAccessViewIndex] = 0; + } + } + + jobDescriptor.dimensions[0] = dispatchX; + jobDescriptor.dimensions[1] = dispatchY; + jobDescriptor.dimensions[2] = 1; + jobDescriptor.pipeline = *pipeline; + + for (uint32_t currentRootConstantIndex = 0; currentRootConstantIndex < pipeline->constCount; ++currentRootConstantIndex) { + wcscpy_s( jobDescriptor.cbNames[currentRootConstantIndex], pipeline->cbResourceBindings[currentRootConstantIndex].name); + jobDescriptor.cbs[currentRootConstantIndex] = globalFsr2ConstantBuffers[pipeline->cbResourceBindings[currentRootConstantIndex].resourceIdentifier]; + jobDescriptor.cbSlotIndex[currentRootConstantIndex] = pipeline->cbResourceBindings[currentRootConstantIndex].slotIndex; + } + + FfxGpuJobDescription dispatchJob = { FFX_GPU_JOB_COMPUTE }; + dispatchJob.computeJobDescriptor = jobDescriptor; + + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &dispatchJob); +} + +static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2DispatchDescription* params) +{ + if ((context->contextDescription.flags & FFX_FSR2_ENABLE_DEBUG_CHECKING) == FFX_FSR2_ENABLE_DEBUG_CHECKING) + { + fsr2DebugCheckDispatch(context, params); + } + // take a short cut to the command list + FfxCommandList commandList = params->commandList; + + // try and refresh shaders first. Early exit in case of error. 
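+    // (A pipeline refresh recreates every pass pipeline via createPipelineStates();
+    // any failure code is returned to the caller before any work is scheduled.)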
+ if (context->refreshPipelineStates) { + + context->refreshPipelineStates = false; + + const FfxErrorCode errorCode = createPipelineStates(context); + FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); + } + + if (context->firstExecution) + { + FfxGpuJobDescription clearJob = { FFX_GPU_JOB_CLEAR_FLOAT }; + + const float clearValuesToZeroFloat[]{ 0.f, 0.f, 0.f, 0.f }; + memcpy(clearJob.clearJobDescriptor.color, clearValuesToZeroFloat, 4 * sizeof(float)); + + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + } + + // Prepare per frame descriptor tables + const bool isOddFrame = !!(context->resourceFrameIndex & 1); + const uint32_t currentCpuOnlyTableBase = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_COUNT : 0; + const uint32_t currentGpuTableBase = 2 * FFX_FSR2_RESOURCE_IDENTIFIER_COUNT * context->resourceFrameIndex; + const uint32_t lockStatusSrvResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2 : FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1; + const uint32_t lockStatusUavResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1 : FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2; + const uint32_t upscaledColorSrvResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_2 : FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_1; + const uint32_t upscaledColorUavResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_1 : FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_2; + const uint32_t dilatedMotionVectorsResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_2 : FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_1; + const uint32_t previousDilatedMotionVectorsResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_1 : FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_2; + const uint32_t lumaHistorySrvResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_2 : FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_1; + const uint32_t lumaHistoryUavResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_1 : FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_2; + + const uint32_t prevPreAlphaColorSrvResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_2 : FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_1; + const uint32_t prevPreAlphaColorUavResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_1 : FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_2; + const uint32_t prevPostAlphaColorSrvResourceIndex = isOddFrame ? FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_2 : FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_1; + const uint32_t prevPostAlphaColorUavResourceIndex = isOddFrame ? 
FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_1 : FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_2;
+
+    const bool resetAccumulation = params->reset || context->firstExecution;
+    context->firstExecution = false;
+
+    context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->color, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR]);
+    context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->depth, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH]);
+    context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->motionVectors, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS]);
+
+    // if auto exposure is enabled use the auto exposure SRV, otherwise what the app sends.
+    if (context->contextDescription.flags & FFX_FSR2_ENABLE_AUTO_EXPOSURE) {
+        context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE];
+    } else {
+        if (ffxFsr2ResourceIsNull(params->exposure)) {
+            context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_EXPOSURE];
+        } else {
+            context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->exposure, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE]);
+        }
+    }
+
+    if (params->enableAutoReactive)
+    {
+        context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->colorOpaqueOnly, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR]);
+    }
+
+    if (ffxFsr2ResourceIsNull(params->reactive)) {
+        context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY];
+    }
+    else {
+        context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->reactive, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK]);
+    }
+
+    if (ffxFsr2ResourceIsNull(params->transparencyAndComposition)) {
+        context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY];
+    } else {
+        context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->transparencyAndComposition, &context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK]);
+    }
+
+    context->contextDescription.callbacks.fpRegisterResource(&context->contextDescription.callbacks, &params->output, &context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT]);
+    context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS] = context->srvResources[lockStatusSrvResourceIndex];
+    context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR] = context->srvResources[upscaledColorSrvResourceIndex];
+    context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS] = context->uavResources[lockStatusUavResourceIndex];
+    context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR] = context->uavResources[upscaledColorUavResourceIndex];
+    context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT] = context->uavResources[upscaledColorUavResourceIndex];
+
+    context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS] =
context->srvResources[dilatedMotionVectorsResourceIndex]; + context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS] = context->uavResources[dilatedMotionVectorsResourceIndex]; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS] = context->srvResources[previousDilatedMotionVectorsResourceIndex]; + + context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY] = context->uavResources[lumaHistoryUavResourceIndex]; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY] = context->srvResources[lumaHistorySrvResourceIndex]; + + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR] = context->srvResources[prevPreAlphaColorSrvResourceIndex]; + context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR] = context->uavResources[prevPreAlphaColorUavResourceIndex]; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR] = context->srvResources[prevPostAlphaColorSrvResourceIndex]; + context->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR] = context->uavResources[prevPostAlphaColorUavResourceIndex]; + + // actual resource size may differ from render/display resolution (e.g. due to Hw/API restrictions), so query the descriptor for UVs adjustment + const FfxResourceDescription resourceDescInputColor = context->contextDescription.callbacks.fpGetResourceDescription(&context->contextDescription.callbacks, context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR]); + const FfxResourceDescription resourceDescLockStatus = context->contextDescription.callbacks.fpGetResourceDescription(&context->contextDescription.callbacks, context->srvResources[lockStatusSrvResourceIndex]); + const FfxResourceDescription resourceDescReactiveMask = context->contextDescription.callbacks.fpGetResourceDescription(&context->contextDescription.callbacks, context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK]); + FFX_ASSERT(resourceDescInputColor.type == FFX_RESOURCE_TYPE_TEXTURE2D); + FFX_ASSERT(resourceDescLockStatus.type == FFX_RESOURCE_TYPE_TEXTURE2D); + + context->constants.jitterOffset[0] = params->jitterOffset.x; + context->constants.jitterOffset[1] = params->jitterOffset.y; + context->constants.renderSize[0] = int32_t(params->renderSize.width ? params->renderSize.width : resourceDescInputColor.width); + context->constants.renderSize[1] = int32_t(params->renderSize.height ? params->renderSize.height : resourceDescInputColor.height); + context->constants.maxRenderSize[0] = int32_t(context->contextDescription.maxRenderSize.width); + context->constants.maxRenderSize[1] = int32_t(context->contextDescription.maxRenderSize.height); + context->constants.inputColorResourceDimensions[0] = resourceDescInputColor.width; + context->constants.inputColorResourceDimensions[1] = resourceDescInputColor.height; + + // compute the horizontal FOV for the shader from the vertical one. + const float aspectRatio = (float)params->renderSize.width / (float)params->renderSize.height; + const float cameraAngleHorizontal = atan(tan(params->cameraFovAngleVertical / 2) * aspectRatio) * 2; + context->constants.tanHalfFOV = tanf(cameraAngleHorizontal * 0.5f); + context->constants.viewSpaceToMetersFactor = (params->viewSpaceToMetersFactor > 0.0f) ? 
params->viewSpaceToMetersFactor : 1.0f; + + // compute params to enable device depth to view space depth computation in shader + setupDeviceDepthToViewSpaceDepthParams(context, params); + + // To be updated if resource is larger than the actual image size + context->constants.downscaleFactor[0] = float(context->constants.renderSize[0]) / context->contextDescription.displaySize.width; + context->constants.downscaleFactor[1] = float(context->constants.renderSize[1]) / context->contextDescription.displaySize.height; + context->constants.previousFramePreExposure = context->constants.preExposure; + context->constants.preExposure = (params->preExposure != 0) ? params->preExposure : 1.0f; + + // motion vector data + const int32_t* motionVectorsTargetSize = (context->contextDescription.flags & FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS) ? context->constants.displaySize : context->constants.renderSize; + + context->constants.motionVectorScale[0] = (params->motionVectorScale.x / motionVectorsTargetSize[0]); + context->constants.motionVectorScale[1] = (params->motionVectorScale.y / motionVectorsTargetSize[1]); + + // compute jitter cancellation + if (context->contextDescription.flags & FFX_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION) { + + context->constants.motionVectorJitterCancellation[0] = (context->previousJitterOffset[0] - context->constants.jitterOffset[0]) / motionVectorsTargetSize[0]; + context->constants.motionVectorJitterCancellation[1] = (context->previousJitterOffset[1] - context->constants.jitterOffset[1]) / motionVectorsTargetSize[1]; + + context->previousJitterOffset[0] = context->constants.jitterOffset[0]; + context->previousJitterOffset[1] = context->constants.jitterOffset[1]; + } + + // lock data, assuming jitter sequence length computation for now + const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(params->renderSize.width, context->contextDescription.displaySize.width); + + // init on first frame + if (resetAccumulation || context->constants.jitterPhaseCount == 0) { + context->constants.jitterPhaseCount = (float)jitterPhaseCount; + } else { + const int32_t jitterPhaseCountDelta = (int32_t)(jitterPhaseCount - context->constants.jitterPhaseCount); + if (jitterPhaseCountDelta > 0) { + context->constants.jitterPhaseCount++; + } else if (jitterPhaseCountDelta < 0) { + context->constants.jitterPhaseCount--; + } + } + + // convert delta time to seconds and clamp to [0, 1]. + context->constants.deltaTime = FFX_MAXIMUM(0.0f, FFX_MINIMUM(1.0f, params->frameTimeDelta / 1000.0f)); + + if (resetAccumulation) { + context->constants.frameIndex = 0; + } else { + context->constants.frameIndex++; + } + + // shading change usage of the SPD mip levels. 
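+    // (The luminance pyramid mip selected here, FFX_FSR2_SHADING_CHANGE_MIP_LEVEL,
+    // is the mip sampled by the shading-change logic; its dimensions are derived
+    // from maxRenderSize just below.)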
+ context->constants.lumaMipLevelToUse = uint32_t(FFX_FSR2_SHADING_CHANGE_MIP_LEVEL); + + const float mipDiv = float(2 << context->constants.lumaMipLevelToUse); + context->constants.lumaMipDimensions[0] = uint32_t(context->constants.maxRenderSize[0] / mipDiv); + context->constants.lumaMipDimensions[1] = uint32_t(context->constants.maxRenderSize[1] / mipDiv); + + // -- GODOT start -- + memcpy(context->constants.reprojectionMatrix, params->reprojectionMatrix, sizeof(context->constants.reprojectionMatrix)); + // -- GODOT end -- + + // reactive mask bias + const int32_t threadGroupWorkRegionDim = 8; + const int32_t dispatchSrcX = (context->constants.renderSize[0] + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + const int32_t dispatchSrcY = (context->constants.renderSize[1] + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + const int32_t dispatchDstX = (context->contextDescription.displaySize.width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + const int32_t dispatchDstY = (context->contextDescription.displaySize.height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + // Clear reconstructed depth for max depth store. + if (resetAccumulation) { + + FfxGpuJobDescription clearJob = { FFX_GPU_JOB_CLEAR_FLOAT }; + + // LockStatus resource has no sign bit, callback functions are compensating for this. + // Clearing the resource must follow the same logic. + float clearValuesLockStatus[4]{}; + clearValuesLockStatus[LOCK_LIFETIME_REMAINING] = 0.0f; + clearValuesLockStatus[LOCK_TEMPORAL_LUMA] = 0.0f; + + memcpy(clearJob.clearJobDescriptor.color, clearValuesLockStatus, 4 * sizeof(float)); + clearJob.clearJobDescriptor.target = context->srvResources[lockStatusSrvResourceIndex]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + + const float clearValuesToZeroFloat[]{ 0.f, 0.f, 0.f, 0.f }; + memcpy(clearJob.clearJobDescriptor.color, clearValuesToZeroFloat, 4 * sizeof(float)); + clearJob.clearJobDescriptor.target = context->srvResources[upscaledColorSrvResourceIndex]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + + //if (context->contextDescription.flags & FFX_FSR2_ENABLE_AUTO_EXPOSURE) + // Auto exposure always used to track luma changes in locking logic + { + const float clearValuesExposure[]{ -1.f, 1e8f, 0.f, 0.f }; + memcpy(clearJob.clearJobDescriptor.color, clearValuesExposure, 4 * sizeof(float)); + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + } + } + + // Auto exposure + uint32_t dispatchThreadGroupCountXY[2]; + uint32_t workGroupOffset[2]; + uint32_t numWorkGroupsAndMips[2]; + uint32_t rectInfo[4] = { 0, 0, params->renderSize.width, params->renderSize.height }; + SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo); + + // downsample + Fsr2SpdConstants luminancePyramidConstants; + luminancePyramidConstants.numworkGroups = numWorkGroupsAndMips[0]; + luminancePyramidConstants.mips = numWorkGroupsAndMips[1]; + luminancePyramidConstants.workGroupOffset[0] = workGroupOffset[0]; + 
luminancePyramidConstants.workGroupOffset[1] = workGroupOffset[1]; + luminancePyramidConstants.renderSize[0] = params->renderSize.width; + luminancePyramidConstants.renderSize[1] = params->renderSize.height; + + // compute the constants. + Fsr2RcasConstants rcasConsts = {}; + const float sharpenessRemapped = (-2.0f * params->sharpness) + 2.0f; + FsrRcasCon(rcasConsts.rcasConfig, sharpenessRemapped); + + Fsr2GenerateReactiveConstants2 genReactiveConsts = {}; + genReactiveConsts.autoTcThreshold = params->autoTcThreshold; + genReactiveConsts.autoTcScale = params->autoTcScale; + genReactiveConsts.autoReactiveScale = params->autoReactiveScale; + genReactiveConsts.autoReactiveMax = params->autoReactiveMax; + + // initialize constantBuffers data + memcpy(&globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2].data, &context->constants, globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2].uint32Size * sizeof(uint32_t)); + memcpy(&globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD].data, &luminancePyramidConstants, globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD].uint32Size * sizeof(uint32_t)); + memcpy(&globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS].data, &rcasConsts, globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS].uint32Size * sizeof(uint32_t)); + memcpy(&globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_GENREACTIVE].data, &genReactiveConsts, globalFsr2ConstantBuffers[FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_GENREACTIVE].uint32Size * sizeof(uint32_t)); + + // Auto reactive + if (params->enableAutoReactive) + { + generateReactiveMaskInternal(context, params); + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE]; + context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK] = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION]; + } + scheduleDispatch(context, params, &context->pipelineComputeLuminancePyramid, dispatchThreadGroupCountXY[0], dispatchThreadGroupCountXY[1]); + scheduleDispatch(context, params, &context->pipelineReconstructPreviousDepth, dispatchSrcX, dispatchSrcY); + scheduleDispatch(context, params, &context->pipelineDepthClip, dispatchSrcX, dispatchSrcY); + + const bool sharpenEnabled = params->enableSharpening; + + scheduleDispatch(context, params, &context->pipelineLock, dispatchSrcX, dispatchSrcY); + scheduleDispatch(context, params, sharpenEnabled ? &context->pipelineAccumulateSharpen : &context->pipelineAccumulate, dispatchDstX, dispatchDstY); + + // RCAS + if (sharpenEnabled) { + + // dispatch RCAS + const int32_t threadGroupWorkRegionDimRCAS = 16; + const int32_t dispatchX = (context->contextDescription.displaySize.width + (threadGroupWorkRegionDimRCAS - 1)) / threadGroupWorkRegionDimRCAS; + const int32_t dispatchY = (context->contextDescription.displaySize.height + (threadGroupWorkRegionDimRCAS - 1)) / threadGroupWorkRegionDimRCAS; + scheduleDispatch(context, params, &context->pipelineRCAS, dispatchX, dispatchY); + } + + context->resourceFrameIndex = (context->resourceFrameIndex + 1) % FSR2_MAX_QUEUED_FRAMES; + + // Fsr2MaxQueuedFrames must be an even number. 
+ FFX_STATIC_ASSERT((FSR2_MAX_QUEUED_FRAMES & 1) == 0); + + context->contextDescription.callbacks.fpExecuteGpuJobs(&context->contextDescription.callbacks, commandList); + + // release dynamic resources + context->contextDescription.callbacks.fpUnregisterResources(&context->contextDescription.callbacks); + + return FFX_OK; +} + +FfxErrorCode ffxFsr2ContextCreate(FfxFsr2Context* context, const FfxFsr2ContextDescription* contextDescription) +{ + // zero context memory + memset(context, 0, sizeof(FfxFsr2Context)); + + // check pointers are valid. + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + contextDescription, + FFX_ERROR_INVALID_POINTER); + + // validate that all callbacks are set for the interface + FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpGetDeviceCapabilities, FFX_ERROR_INCOMPLETE_INTERFACE); + FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpCreateBackendContext, FFX_ERROR_INCOMPLETE_INTERFACE); + FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpDestroyBackendContext, FFX_ERROR_INCOMPLETE_INTERFACE); + + // if a scratch buffer is declared, then we must have a size + if (contextDescription->callbacks.scratchBuffer) { + + FFX_RETURN_ON_ERROR(contextDescription->callbacks.scratchBufferSize, FFX_ERROR_INCOMPLETE_INTERFACE); + } + + // ensure the context is large enough for the internal context. + FFX_STATIC_ASSERT(sizeof(FfxFsr2Context) >= sizeof(FfxFsr2Context_Private)); + + // create the context. + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)(context); + const FfxErrorCode errorCode = fsr2Create(contextPrivate, contextDescription); + + return errorCode; +} + +FfxErrorCode ffxFsr2ContextDestroy(FfxFsr2Context* context) +{ + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + + // destroy the context. + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)(context); + const FfxErrorCode errorCode = fsr2Release(contextPrivate); + return errorCode; +} + +FfxErrorCode ffxFsr2ContextDispatch(FfxFsr2Context* context, const FfxFsr2DispatchDescription* dispatchParams) +{ + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + dispatchParams, + FFX_ERROR_INVALID_POINTER); + + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)(context); + + // validate that renderSize is within the maximum. + FFX_RETURN_ON_ERROR( + dispatchParams->renderSize.width <= contextPrivate->contextDescription.maxRenderSize.width, + FFX_ERROR_OUT_OF_RANGE); + FFX_RETURN_ON_ERROR( + dispatchParams->renderSize.height <= contextPrivate->contextDescription.maxRenderSize.height, + FFX_ERROR_OUT_OF_RANGE); + FFX_RETURN_ON_ERROR( + contextPrivate->device, + FFX_ERROR_NULL_DEVICE); + + // dispatch the FSR2 passes. 
+ const FfxErrorCode errorCode = fsr2Dispatch(contextPrivate, dispatchParams); + return errorCode; +} + +float ffxFsr2GetUpscaleRatioFromQualityMode(FfxFsr2QualityMode qualityMode) +{ + switch (qualityMode) { + + case FFX_FSR2_QUALITY_MODE_QUALITY: + return 1.5f; + case FFX_FSR2_QUALITY_MODE_BALANCED: + return 1.7f; + case FFX_FSR2_QUALITY_MODE_PERFORMANCE: + return 2.0f; + case FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE: + return 3.0f; + default: + return 0.0f; + } +} + +FfxErrorCode ffxFsr2GetRenderResolutionFromQualityMode( + uint32_t* renderWidth, + uint32_t* renderHeight, + uint32_t displayWidth, + uint32_t displayHeight, + FfxFsr2QualityMode qualityMode) +{ + FFX_RETURN_ON_ERROR( + renderWidth, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + renderHeight, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + FFX_FSR2_QUALITY_MODE_QUALITY <= qualityMode && qualityMode <= FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE, + FFX_ERROR_INVALID_ENUM); + + // scale by the predefined ratios in each dimension. + const float ratio = ffxFsr2GetUpscaleRatioFromQualityMode(qualityMode); + const uint32_t scaledDisplayWidth = (uint32_t)((float)displayWidth / ratio); + const uint32_t scaledDisplayHeight = (uint32_t)((float)displayHeight / ratio); + *renderWidth = scaledDisplayWidth; + *renderHeight = scaledDisplayHeight; + + return FFX_OK; +} + +FfxErrorCode ffxFsr2ContextEnqueueRefreshPipelineRequest(FfxFsr2Context* context) +{ + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)context; + contextPrivate->refreshPipelineStates = true; + + return FFX_OK; +} + +int32_t ffxFsr2GetJitterPhaseCount(int32_t renderWidth, int32_t displayWidth) +{ + const float basePhaseCount = 8.0f; + const int32_t jitterPhaseCount = int32_t(basePhaseCount * pow((float(displayWidth) / renderWidth), 2.0f)); + return jitterPhaseCount; +} + +FfxErrorCode ffxFsr2GetJitterOffset(float* outX, float* outY, int32_t index, int32_t phaseCount) +{ + FFX_RETURN_ON_ERROR( + outX, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + outY, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + phaseCount > 0, + FFX_ERROR_INVALID_ARGUMENT); + + const float x = halton((index % phaseCount) + 1, 2) - 0.5f; + const float y = halton((index % phaseCount) + 1, 3) - 0.5f; + + *outX = x; + *outY = y; + return FFX_OK; +} + +FFX_API bool ffxFsr2ResourceIsNull(FfxResource resource) +{ + return resource.resource == NULL; +} + +FfxErrorCode ffxFsr2ContextGenerateReactiveMask(FfxFsr2Context* context, const FfxFsr2GenerateReactiveDescription* params) +{ + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + params, + FFX_ERROR_INVALID_POINTER); + FFX_RETURN_ON_ERROR( + params->commandList, + FFX_ERROR_INVALID_POINTER); + + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)(context); + + FFX_RETURN_ON_ERROR( + contextPrivate->device, + FFX_ERROR_NULL_DEVICE); + + if (contextPrivate->refreshPipelineStates) { + + createPipelineStates(contextPrivate); + contextPrivate->refreshPipelineStates = false; + } + + // take a short cut to the command list + FfxCommandList commandList = params->commandList; + + FfxPipelineState* pipeline = &contextPrivate->pipelineGenerateReactive; + + const int32_t threadGroupWorkRegionDim = 8; + const int32_t dispatchSrcX = (params->renderSize.width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + const int32_t dispatchSrcY = (params->renderSize.height + (threadGroupWorkRegionDim - 1)) / 
threadGroupWorkRegionDim; + + // save internal reactive resource + FfxResourceInternal internalReactive = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE]; + + FfxComputeJobDescription jobDescriptor = {}; + contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->colorOpaqueOnly, &contextPrivate->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY]); + contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->colorPreUpscale, &contextPrivate->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR]); + contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->outReactive, &contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE]); + + jobDescriptor.uavs[0] = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE]; + + wcscpy_s(jobDescriptor.srvNames[0], pipeline->srvResourceBindings[0].name); + wcscpy_s(jobDescriptor.srvNames[1], pipeline->srvResourceBindings[1].name); + wcscpy_s(jobDescriptor.uavNames[0], pipeline->uavResourceBindings[0].name); + + jobDescriptor.dimensions[0] = dispatchSrcX; + jobDescriptor.dimensions[1] = dispatchSrcY; + jobDescriptor.dimensions[2] = 1; + jobDescriptor.pipeline = *pipeline; + + for (uint32_t currentShaderResourceViewIndex = 0; currentShaderResourceViewIndex < pipeline->srvCount; ++currentShaderResourceViewIndex) { + + const uint32_t currentResourceId = pipeline->srvResourceBindings[currentShaderResourceViewIndex].resourceIdentifier; + const FfxResourceInternal currentResource = contextPrivate->srvResources[currentResourceId]; + jobDescriptor.srvs[currentShaderResourceViewIndex] = currentResource; + wcscpy_s(jobDescriptor.srvNames[currentShaderResourceViewIndex], pipeline->srvResourceBindings[currentShaderResourceViewIndex].name); + } + + Fsr2GenerateReactiveConstants constants = {}; + constants.scale = params->scale; + constants.threshold = params->cutoffThreshold; + constants.binaryValue = params->binaryValue; + constants.flags = params->flags; + + jobDescriptor.cbs[0].uint32Size = sizeof(constants); + memcpy(&jobDescriptor.cbs[0].data, &constants, sizeof(constants)); + wcscpy_s(jobDescriptor.cbNames[0], pipeline->cbResourceBindings[0].name); + + FfxGpuJobDescription dispatchJob = { FFX_GPU_JOB_COMPUTE }; + dispatchJob.computeJobDescriptor = jobDescriptor; + + contextPrivate->contextDescription.callbacks.fpScheduleGpuJob(&contextPrivate->contextDescription.callbacks, &dispatchJob); + + contextPrivate->contextDescription.callbacks.fpExecuteGpuJobs(&contextPrivate->contextDescription.callbacks, commandList); + + // restore internal reactive + contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE] = internalReactive; + + return FFX_OK; +} + +static FfxErrorCode generateReactiveMaskInternal(FfxFsr2Context_Private* contextPrivate, const FfxFsr2DispatchDescription* params) +{ + if (contextPrivate->refreshPipelineStates) { + + createPipelineStates(contextPrivate); + contextPrivate->refreshPipelineStates = false; + } + + // take a short cut to the command list + FfxCommandList commandList = params->commandList; + + FfxPipelineState* pipeline = &contextPrivate->pipelineTcrAutogenerate; + + const int32_t threadGroupWorkRegionDim = 8; + const int32_t dispatchSrcX = (params->renderSize.width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + const int32_t dispatchSrcY = (params->renderSize.height + 
(threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + FfxComputeJobDescription jobDescriptor = {}; + contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->colorOpaqueOnly, &contextPrivate->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY]); + contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->color, &contextPrivate->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR]); + + jobDescriptor.uavs[0] = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE]; + jobDescriptor.uavs[1] = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION]; + jobDescriptor.uavs[2] = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR]; + jobDescriptor.uavs[3] = contextPrivate->uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR]; + + wcscpy_s(jobDescriptor.uavNames[0], pipeline->uavResourceBindings[0].name); + wcscpy_s(jobDescriptor.uavNames[1], pipeline->uavResourceBindings[1].name); + wcscpy_s(jobDescriptor.uavNames[2], pipeline->uavResourceBindings[2].name); + wcscpy_s(jobDescriptor.uavNames[3], pipeline->uavResourceBindings[3].name); + + jobDescriptor.dimensions[0] = dispatchSrcX; + jobDescriptor.dimensions[1] = dispatchSrcY; + jobDescriptor.dimensions[2] = 1; + jobDescriptor.pipeline = *pipeline; + + for (uint32_t currentShaderResourceViewIndex = 0; currentShaderResourceViewIndex < pipeline->srvCount; ++currentShaderResourceViewIndex) { + + const uint32_t currentResourceId = pipeline->srvResourceBindings[currentShaderResourceViewIndex].resourceIdentifier; + const FfxResourceInternal currentResource = contextPrivate->srvResources[currentResourceId]; + jobDescriptor.srvs[currentShaderResourceViewIndex] = currentResource; + wcscpy_s(jobDescriptor.srvNames[currentShaderResourceViewIndex], pipeline->srvResourceBindings[currentShaderResourceViewIndex].name); + } + + for (uint32_t currentRootConstantIndex = 0; currentRootConstantIndex < pipeline->constCount; ++currentRootConstantIndex) { + wcscpy_s(jobDescriptor.cbNames[currentRootConstantIndex], pipeline->cbResourceBindings[currentRootConstantIndex].name); + jobDescriptor.cbs[currentRootConstantIndex] = globalFsr2ConstantBuffers[pipeline->cbResourceBindings[currentRootConstantIndex].resourceIdentifier]; + jobDescriptor.cbSlotIndex[currentRootConstantIndex] = pipeline->cbResourceBindings[currentRootConstantIndex].slotIndex; + } + + FfxGpuJobDescription dispatchJob = { FFX_GPU_JOB_COMPUTE }; + dispatchJob.computeJobDescriptor = jobDescriptor; + + contextPrivate->contextDescription.callbacks.fpScheduleGpuJob(&contextPrivate->contextDescription.callbacks, &dispatchJob); + + return FFX_OK; +} diff --git a/thirdparty/amd-fsr2/ffx_fsr2.h b/thirdparty/amd-fsr2/ffx_fsr2.h new file mode 100644 index 00000000000..7df3773cccb --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_fsr2.h @@ -0,0 +1,458 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + +// @defgroup FSR2 + +#pragma once + +// Include the interface for the backend of the FSR2 API. +#include "ffx_fsr2_interface.h" + +/// FidelityFX Super Resolution 2 major version. +/// +/// @ingroup FSR2 +#define FFX_FSR2_VERSION_MAJOR (2) + +/// FidelityFX Super Resolution 2 minor version. +/// +/// @ingroup FSR2 +#define FFX_FSR2_VERSION_MINOR (2) + +/// FidelityFX Super Resolution 2 patch version. +/// +/// @ingroup FSR2 +#define FFX_FSR2_VERSION_PATCH (1) + +/// The size of the context specified in 32bit values. +/// +/// @ingroup FSR2 +#define FFX_FSR2_CONTEXT_SIZE (16536) + +#if defined(__cplusplus) +extern "C" { +#endif // #if defined(__cplusplus) + +/// An enumeration of all the quality modes supported by FidelityFX Super +/// Resolution 2 upscaling. +/// +/// In order to provide a consistent user experience across multiple +/// applications which implement FSR2. It is strongly recommended that the +/// following preset scaling factors are made available through your +/// application's user interface. +/// +/// If your application does not expose the notion of preset scaling factors +/// for upscaling algorithms (perhaps instead implementing a fixed ratio which +/// is immutable) or implementing a more dynamic scaling scheme (such as +/// dynamic resolution scaling), then there is no need to use these presets. +/// +/// Please note that FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE is +/// an optional mode which may introduce significant quality degradation in the +/// final image. As such it is recommended that you evaluate the final results +/// of using this scaling mode before deciding if you should include it in your +/// application. +/// +/// @ingroup FSR2 +typedef enum FfxFsr2QualityMode { + + FFX_FSR2_QUALITY_MODE_QUALITY = 1, ///< Perform upscaling with a per-dimension upscaling ratio of 1.5x. + FFX_FSR2_QUALITY_MODE_BALANCED = 2, ///< Perform upscaling with a per-dimension upscaling ratio of 1.7x. + FFX_FSR2_QUALITY_MODE_PERFORMANCE = 3, ///< Perform upscaling with a per-dimension upscaling ratio of 2.0x. + FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE = 4 ///< Perform upscaling with a per-dimension upscaling ratio of 3.0x. +} FfxFsr2QualityMode; + +/// An enumeration of bit flags used when creating a +/// FfxFsr2Context. See FfxFsr2ContextDescription. 
+/// +/// @ingroup FSR2 +typedef enum FfxFsr2InitializationFlagBits { + + FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE = (1<<0), ///< A bit indicating if the input color data provided is using a high-dynamic range. + FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS = (1<<1), ///< A bit indicating if the motion vectors are rendered at display resolution. + FFX_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION = (1<<2), ///< A bit indicating that the motion vectors have the jittering pattern applied to them. + FFX_FSR2_ENABLE_DEPTH_INVERTED = (1<<3), ///< A bit indicating that the input depth buffer data provided is inverted [1..0]. + FFX_FSR2_ENABLE_DEPTH_INFINITE = (1<<4), ///< A bit indicating that the input depth buffer data provided is using an infinite far plane. + FFX_FSR2_ENABLE_AUTO_EXPOSURE = (1<<5), ///< A bit indicating if automatic exposure should be applied to input color data. + FFX_FSR2_ENABLE_DYNAMIC_RESOLUTION = (1<<6), ///< A bit indicating that the application uses dynamic resolution scaling. + FFX_FSR2_ENABLE_TEXTURE1D_USAGE = (1<<7), ///< A bit indicating that the backend should use 1D textures. + FFX_FSR2_ENABLE_DEBUG_CHECKING = (1<<8), ///< A bit indicating that the runtime should check some API values and report issues. +} FfxFsr2InitializationFlagBits; + +/// A structure encapsulating the parameters required to initialize FidelityFX +/// Super Resolution 2 upscaling. +/// +/// @ingroup FSR2 +typedef struct FfxFsr2ContextDescription { + + uint32_t flags; ///< A collection of FfxFsr2InitializationFlagBits. + FfxDimensions2D maxRenderSize; ///< The maximum size that rendering will be performed at. + FfxDimensions2D displaySize; ///< The size of the presentation resolution targeted by the upscaling process. + FfxFsr2Interface callbacks; ///< A set of pointers to the backend implementation for FSR 2.0. + FfxDevice device; ///< The abstracted device which is passed to some callback functions. + + FfxFsr2Message fpMessage; ///< A pointer to a function that can recieve messages from the runtime. +} FfxFsr2ContextDescription; + +/// A structure encapsulating the parameters for dispatching the various passes +/// of FidelityFX Super Resolution 2. +/// +/// @ingroup FSR2 +typedef struct FfxFsr2DispatchDescription { + + FfxCommandList commandList; ///< The FfxCommandList to record FSR2 rendering commands into. + FfxResource color; ///< A FfxResource containing the color buffer for the current frame (at render resolution). + FfxResource depth; ///< A FfxResource containing 32bit depth values for the current frame (at render resolution). + FfxResource motionVectors; ///< A FfxResource containing 2-dimensional motion vectors (at render resolution if FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS is not set). + FfxResource exposure; ///< A optional FfxResource containing a 1x1 exposure value. + FfxResource reactive; ///< A optional FfxResource containing alpha value of reactive objects in the scene. + FfxResource transparencyAndComposition; ///< A optional FfxResource containing alpha value of special objects in the scene. + FfxResource output; ///< A FfxResource containing the output color buffer for the current frame (at presentation resolution). + FfxFloatCoords2D jitterOffset; ///< The subpixel jitter offset applied to the camera. + FfxFloatCoords2D motionVectorScale; ///< The scale factor to apply to motion vectors. + FfxDimensions2D renderSize; ///< The resolution that was used for rendering the input resources. + bool enableSharpening; ///< Enable an additional sharpening pass. 
+ float sharpness; ///< The sharpness value between 0 and 1, where 0 is no additional sharpness and 1 is maximum additional sharpness. + float frameTimeDelta; ///< The time elapsed since the last frame (expressed in milliseconds). + float preExposure; ///< The pre exposure value (must be > 0.0f) + bool reset; ///< A boolean value which when set to true, indicates the camera has moved discontinuously. + float cameraNear; ///< The distance to the near plane of the camera. + float cameraFar; ///< The distance to the far plane of the camera. + float cameraFovAngleVertical; ///< The camera angle field of view in the vertical direction (expressed in radians). + float viewSpaceToMetersFactor; ///< The scale factor to convert view space units to meters + + // EXPERIMENTAL reactive mask generation parameters + bool enableAutoReactive; ///< A boolean value to indicate internal reactive autogeneration should be used + FfxResource colorOpaqueOnly; ///< A FfxResource containing the opaque only color buffer for the current frame (at render resolution). + float autoTcThreshold; ///< Cutoff value for TC + float autoTcScale; ///< A value to scale the transparency and composition mask + float autoReactiveScale; ///< A value to scale the reactive mask + float autoReactiveMax; ///< A value to clamp the reactive mask + + // -- GODOT start -- + float reprojectionMatrix[16]; ///< The matrix used for reprojecting pixels with invalid motion vectors by using the depth. + // -- GODOT end -- + +} FfxFsr2DispatchDescription; + +/// A structure encapsulating the parameters for automatic generation of a reactive mask +/// +/// @ingroup FSR2 +typedef struct FfxFsr2GenerateReactiveDescription { + + FfxCommandList commandList; ///< The FfxCommandList to record FSR2 rendering commands into. + FfxResource colorOpaqueOnly; ///< A FfxResource containing the opaque only color buffer for the current frame (at render resolution). + FfxResource colorPreUpscale; ///< A FfxResource containing the opaque+translucent color buffer for the current frame (at render resolution). + FfxResource outReactive; ///< A FfxResource containing the surface to generate the reactive mask into. + FfxDimensions2D renderSize; ///< The resolution that was used for rendering the input resources. + float scale; ///< A value to scale the output + float cutoffThreshold; ///< A threshold value to generate a binary reactive mask + float binaryValue; ///< A value to set for the binary reactive mask + uint32_t flags; ///< Flags to determine how to generate the reactive mask +} FfxFsr2GenerateReactiveDescription; + +/// A structure encapsulating the FidelityFX Super Resolution 2 context. +/// +/// This sets up an object which contains all persistent internal data and +/// resources that are required by FSR2. +/// +/// The FfxFsr2Context object should have a lifetime matching +/// your use of FSR2. Before destroying the FSR2 context care should be taken +/// to ensure the GPU is not accessing the resources created or used by FSR2. +/// It is therefore recommended that the GPU is idle before destroying the +/// FSR2 context. +/// +/// @ingroup FSR2 +typedef struct FfxFsr2Context { + + uint32_t data[FFX_FSR2_CONTEXT_SIZE]; ///< An opaque set of uint32_t which contain the data for the context. +} FfxFsr2Context; + +/// Create a FidelityFX Super Resolution 2 context from the parameters +/// programmed to the FfxFsr2CreateParams structure. 
+/// +/// The context structure is the main object used to interact with the FSR2 +/// API, and is responsible for the management of the internal resources used +/// by the FSR2 algorithm. When this API is called, multiple calls will be +/// made via the pointers contained in the callbacks structure. +/// These callbacks will attempt to retrieve the device capabilities, and +/// create the internal resources, and pipelines required by FSR2's +/// frame-to-frame function. Depending on the precise configuration used when +/// creating the FfxFsr2Context a different set of resources and +/// pipelines might be requested via the callback functions. +/// +/// The flags included in the flags field of +/// FfxFsr2ContextDescription should match the configuration of your +/// application as well as the intended use of FSR2. It is important that these +/// flags are set correctly (as well as a correctly programmed +/// FfxFsr2DispatchDescription) to ensure correct operation. It is +/// recommended to consult the overview documentation for further details on +/// how FSR2 should be integrated into an application. +/// +/// When the FfxFsr2Context is created, you should use the +/// ffxFsr2ContextDispatch function each frame where FSR2 +/// upscaling should be applied. See the documentation of +/// ffxFsr2ContextDispatch for more details. +/// +/// The FfxFsr2Context should be destroyed when use of it is +/// completed, typically when an application is unloaded or FSR2 upscaling is +/// disabled by a user. To destroy the FSR2 context you should call +/// ffxFsr2ContextDestroy. +/// +/// @param [out] context A pointer to a FfxFsr2Context structure to populate. +/// @param [in] contextDescription A pointer to a FfxFsr2ContextDescription structure. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// FFX_ERROR_CODE_NULL_POINTER The operation failed because either context or contextDescription was NULL. +/// @retval +/// FFX_ERROR_INCOMPLETE_INTERFACE The operation failed because the FfxFsr2ContextDescription.callbacks was not fully specified. +/// @retval +/// FFX_ERROR_BACKEND_API_ERROR The operation failed because of an error returned from the backend. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2ContextCreate(FfxFsr2Context* context, const FfxFsr2ContextDescription* contextDescription); + +/// Dispatch the various passes that constitute FidelityFX Super Resolution 2. +/// +/// FSR2 is a composite effect, meaning that it is comprised of multiple +/// constituent passes (implemented as one or more clears, copies and compute +/// dispatches). The ffxFsr2ContextDispatch function is the +/// function which (via the use of the functions contained in the +/// callbacks field of the FfxFsr2Context +/// structure) ultimately generates the sequence of graphics API calls required +/// each frame. +/// +/// As with the creation of the FfxFsr2Context, correctly +/// programming the FfxFsr2DispatchDescription is key to ensuring +/// the correct operation of FSR2. It is particularly important to ensure that +/// camera jitter is correctly applied to your application's projection matrix +/// (or camera origin for raytraced applications). FSR2 provides the +/// ffxFsr2GetJitterPhaseCount and +/// ffxFsr2GetJitterOffset entry points to help applications +/// correctly compute the camera jitter. Whatever jitter pattern is used by the +/// application, it should be correctly programmed to the +/// jitterOffset field of the dispatchDescription +/// structure, as illustrated in the sketch below.
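Illustrative C++ sketch (not part of the FidelityFX SDK or of this patch) of the creation and per-frame dispatch flow described above. The engine-side handles and the helpers setup_backend_interface() and to_ffx_resource() are hypothetical; every ffxFsr2*/FfxFsr2* name comes from this header.

    // One-time setup.
    FfxFsr2ContextDescription context_desc = {};
    context_desc.flags = FFX_FSR2_ENABLE_AUTO_EXPOSURE; // pick flags matching the engine's conventions
    context_desc.displaySize = { display_width, display_height };
    ffxFsr2GetRenderResolutionFromQualityMode(&context_desc.maxRenderSize.width, &context_desc.maxRenderSize.height, display_width, display_height, FFX_FSR2_QUALITY_MODE_QUALITY);
    context_desc.device = device;                     // engine-specific FfxDevice
    setup_backend_interface(&context_desc.callbacks); // hypothetical: fills the FfxFsr2Interface callbacks and scratch buffer

    FfxFsr2Context fsr2_context;
    if (ffxFsr2ContextCreate(&fsr2_context, &context_desc) != FFX_OK) {
        // handle the error (see the @retval list above)
    }

    // Each frame, with inputs at render resolution.
    FfxFsr2DispatchDescription dispatch = {};
    dispatch.commandList = command_list;            // engine-specific FfxCommandList
    dispatch.color = to_ffx_resource(scene_color);  // hypothetical resource-conversion helper
    dispatch.depth = to_ffx_resource(scene_depth);
    dispatch.motionVectors = to_ffx_resource(scene_motion_vectors);
    dispatch.output = to_ffx_resource(upscaled_color);
    dispatch.renderSize = context_desc.maxRenderSize;
    dispatch.motionVectorScale = { float(dispatch.renderSize.width), float(dispatch.renderSize.height) }; // depends on how the engine encodes motion vectors
    const int32_t phase_count = ffxFsr2GetJitterPhaseCount(dispatch.renderSize.width, display_width);
    ffxFsr2GetJitterOffset(&dispatch.jitterOffset.x, &dispatch.jitterOffset.y, frame_index % phase_count, phase_count);
    dispatch.frameTimeDelta = frame_delta_ms; // milliseconds, as documented below
    dispatch.preExposure = 1.0f;
    dispatch.cameraNear = z_near;
    dispatch.cameraFar = z_far;
    dispatch.cameraFovAngleVertical = fov_y_radians;
    dispatch.reset = camera_cut;
    if (ffxFsr2ContextDispatch(&fsr2_context, &dispatch) != FFX_OK) {
        // handle the error
    }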
For more guidance on camera jitter please consult the +/// documentation for ffxFsr2GetJitterOffset as well as the +/// accompanying overview documentation for FSR2. +/// +/// @param [in] context A pointer to a FfxFsr2Context structure. +/// @param [in] dispatchDescription A pointer to a FfxFsr2DispatchDescription structure. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// FFX_ERROR_CODE_NULL_POINTER The operation failed because either context or dispatchDescription was NULL. +/// @retval +/// FFX_ERROR_OUT_OF_RANGE The operation failed because dispatchDescription.renderSize was larger than the maximum render resolution. +/// @retval +/// FFX_ERROR_NULL_DEVICE The operation failed because the device inside the context was NULL. +/// @retval +/// FFX_ERROR_BACKEND_API_ERROR The operation failed because of an error returned from the backend. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2ContextDispatch(FfxFsr2Context* context, const FfxFsr2DispatchDescription* dispatchDescription); + +/// A helper function generate a Reactive mask from an opaque only texure and one containing translucent objects. +/// +/// @param [in] context A pointer to a FfxFsr2Context structure. +/// @param [in] params A pointer to a FfxFsr2GenerateReactiveDescription structure +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2ContextGenerateReactiveMask(FfxFsr2Context* context, const FfxFsr2GenerateReactiveDescription* params); + +/// Destroy the FidelityFX Super Resolution context. +/// +/// @param [out] context A pointer to a FfxFsr2Context structure to destroy. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// FFX_ERROR_CODE_NULL_POINTER The operation failed because either context was NULL. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2ContextDestroy(FfxFsr2Context* context); + +/// Get the upscale ratio from the quality mode. +/// +/// The following table enumerates the mapping of the quality modes to +/// per-dimension scaling ratios. +/// +/// Quality preset | Scale factor +/// ----------------------------------------------------- | ------------- +/// FFX_FSR2_QUALITY_MODE_QUALITY | 1.5x +/// FFX_FSR2_QUALITY_MODE_BALANCED | 1.7x +/// FFX_FSR2_QUALITY_MODE_PERFORMANCE | 2.0x +/// FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE | 3.0x +/// +/// Passing an invalid qualityMode will return 0.0f. +/// +/// @param [in] qualityMode The quality mode preset. +/// +/// @returns +/// The upscaling the per-dimension upscaling ratio for +/// qualityMode according to the table above. +/// +/// @ingroup FSR2 +FFX_API float ffxFsr2GetUpscaleRatioFromQualityMode(FfxFsr2QualityMode qualityMode); + +/// A helper function to calculate the rendering resolution from a target +/// resolution and desired quality level. +/// +/// This function applies the scaling factor returned by +/// ffxFsr2GetUpscaleRatioFromQualityMode to each dimension. +/// +/// @param [out] renderWidth A pointer to a uint32_t which will hold the calculated render resolution width. +/// @param [out] renderHeight A pointer to a uint32_t which will hold the calculated render resolution height. +/// @param [in] displayWidth The target display resolution width. +/// @param [in] displayHeight The target display resolution height. +/// @param [in] qualityMode The desired quality mode for FSR 2 upscaling. +/// +/// @retval +/// FFX_OK The operation completed successfully. 
+/// @retval +/// FFX_ERROR_INVALID_POINTER Either renderWidth or renderHeight was NULL. +/// @retval +/// FFX_ERROR_INVALID_ENUM An invalid quality mode was specified. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2GetRenderResolutionFromQualityMode( + uint32_t* renderWidth, + uint32_t* renderHeight, + uint32_t displayWidth, + uint32_t displayHeight, + FfxFsr2QualityMode qualityMode); + +/// A helper function to calculate the jitter phase count from display +/// resolution. +/// +/// For more detailed information about the application of camera jitter to +/// your application's rendering please refer to the +/// ffxFsr2GetJitterOffset function. +/// +/// The table below shows the jitter phase count which this function +/// would return for each of the quality presets. +/// +/// Quality preset | Scale factor | Phase count +/// ----------------------------------------------------- | ------------- | --------------- +/// FFX_FSR2_QUALITY_MODE_QUALITY | 1.5x | 18 +/// FFX_FSR2_QUALITY_MODE_BALANCED | 1.7x | 23 +/// FFX_FSR2_QUALITY_MODE_PERFORMANCE | 2.0x | 32 +/// FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE | 3.0x | 72 +/// Custom | [1..n]x | ceil(8*n^2) +/// +/// @param [in] renderWidth The render resolution width. +/// @param [in] displayWidth The display resolution width. +/// +/// @returns +/// The jitter phase count for the scaling factor between renderWidth and displayWidth. +/// +/// @ingroup FSR2 +FFX_API int32_t ffxFsr2GetJitterPhaseCount(int32_t renderWidth, int32_t displayWidth); + +/// A helper function to calculate the subpixel jitter offset. +/// +/// FSR2 relies on the application to apply sub-pixel jittering while rendering. +/// This is typically included in the projection matrix of the camera. To make +/// the application of camera jitter simple, the FSR2 API provides a small set +/// of utility function which computes the sub-pixel jitter offset for a +/// particular frame within a sequence of separate jitter offsets. To begin, the +/// index within the jitter phase must be computed. To calculate the +/// sequence's length, you can call the ffxFsr2GetJitterPhaseCount +/// function. The index should be a value which is incremented each frame modulo +/// the length of the sequence computed by ffxFsr2GetJitterPhaseCount. +/// The index within the jitter phase is passed to +/// ffxFsr2GetJitterOffset via the index parameter. +/// +/// This function uses a Halton(2,3) sequence to compute the jitter offset. +/// The ultimate index used for the sequence is index % +/// phaseCount. +/// +/// It is important to understand that the values returned from the +/// ffxFsr2GetJitterOffset function are in unit pixel space, and +/// in order to composite this correctly into a projection matrix we must +/// convert them into projection offsets. This is done as per the pseudo code +/// listing which is shown below. +/// +/// const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth); +/// +/// float jitterX = 0; +/// float jitterY = 0; +/// ffxFsr2GetJitterOffset(&jitterX, &jitterY, index, jitterPhaseCount); +/// +/// const float jitterX = 2.0f * jitterX / (float)renderWidth; +/// const float jitterY = -2.0f * jitterY / (float)renderHeight; +/// const Matrix4 jitterTranslationMatrix = translateMatrix(Matrix3::identity, Vector3(jitterX, jitterY, 0)); +/// const Matrix4 jitteredProjectionMatrix = jitterTranslationMatrix * projectionMatrix; +/// +/// Jitter should be applied to all rendering. This includes opaque, alpha +/// transparent, and raytraced objects. 
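Since the pseudo code listing above declares jitterX and jitterY twice, a corrected, self-contained form of the same computation is given here (Matrix3, Matrix4, Vector3 and translateMatrix remain placeholder math types, exactly as in the listing):

    const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth);

    float jitterX = 0.0f;
    float jitterY = 0.0f;
    ffxFsr2GetJitterOffset(&jitterX, &jitterY, index, jitterPhaseCount);

    // Convert the unit-pixel offsets returned above into projection-space
    // offsets before composing them with the projection matrix.
    const float projOffsetX = 2.0f * jitterX / (float)renderWidth;
    const float projOffsetY = -2.0f * jitterY / (float)renderHeight;
    const Matrix4 jitterTranslationMatrix = translateMatrix(Matrix3::identity, Vector3(projOffsetX, projOffsetY, 0));
    const Matrix4 jitteredProjectionMatrix = jitterTranslationMatrix * projectionMatrix;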
For rasterized objects, the sub-pixel +/// jittering values calculated by the ffxFsr2GetJitterOffset +/// function can be applied to the camera projection matrix which is ultimately +/// used to perform transformations during vertex shading. For raytraced +/// rendering, the sub-pixel jitter should be applied to the ray's origin, +/// often the camera's position. +/// +/// Whether you elect to use the ffxFsr2GetJitterOffset function +/// or your own sequence generator, you must program the +/// jitterOffset field of the +/// FfxFsr2DispatchDescription structure in order to inform FSR2 +/// of the jitter offset that has been applied in order to render each frame. +/// +/// If not using the recommended ffxFsr2GetJitterOffset function, +/// care should be taken that your jitter sequence never generates a null vector; +/// that is, a value of 0 in both the X and Y dimensions. +/// +/// @param [out] outX A pointer to a float which will contain the subpixel jitter offset for the x dimension. +/// @param [out] outY A pointer to a float which will contain the subpixel jitter offset for the y dimension. +/// @param [in] index The index within the jitter sequence. +/// @param [in] phaseCount The length of the jitter phase. See ffxFsr2GetJitterPhaseCount. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// FFX_ERROR_INVALID_POINTER Either outX or outY was NULL. +/// @retval +/// FFX_ERROR_INVALID_ARGUMENT Argument phaseCount must be greater than 0. +/// +/// @ingroup FSR2 +FFX_API FfxErrorCode ffxFsr2GetJitterOffset(float* outX, float* outY, int32_t index, int32_t phaseCount); + +/// A helper function to check if a resource is +/// FFX_FSR2_RESOURCE_IDENTIFIER_NULL. +/// +/// @param [in] resource A FfxResource. +/// +/// @returns +/// true The resource was not FFX_FSR2_RESOURCE_IDENTIFIER_NULL. +/// @returns +/// false The resource was FFX_FSR2_RESOURCE_IDENTIFIER_NULL. +/// +/// @ingroup FSR2 +FFX_API bool ffxFsr2ResourceIsNull(FfxResource resource); + +#if defined(__cplusplus) +} +#endif // #if defined(__cplusplus) diff --git a/thirdparty/amd-fsr2/ffx_fsr2_interface.h b/thirdparty/amd-fsr2/ffx_fsr2_interface.h new file mode 100644 index 00000000000..b6be9760a71 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_fsr2_interface.h @@ -0,0 +1,395 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE.
+ +#pragma once + +#include "ffx_assert.h" +#include "ffx_types.h" +#include "ffx_error.h" + +// Include the FSR2 resources defined in the HLSL code. This shared here to avoid getting out of sync. +#define FFX_CPU +#include "shaders/ffx_fsr2_resources.h" +#include "shaders/ffx_fsr2_common.h" + +#if defined(__cplusplus) +extern "C" { +#endif // #if defined(__cplusplus) + +FFX_FORWARD_DECLARE(FfxFsr2Interface); + +/// An enumeration of all the passes which constitute the FSR2 algorithm. +/// +/// FSR2 is implemented as a composite of several compute passes each +/// computing a key part of the final result. Each call to the +/// FfxFsr2ScheduleGpuJobFunc callback function will +/// correspond to a single pass included in FfxFsr2Pass. For a +/// more comprehensive description of each pass, please refer to the FSR2 +/// reference documentation. +/// +/// Please note in some cases e.g.: FFX_FSR2_PASS_ACCUMULATE +/// and FFX_FSR2_PASS_ACCUMULATE_SHARPEN either one pass or the +/// other will be used (they are mutually exclusive). The choice of which will +/// depend on the way the FfxFsr2Context is created and the +/// precise contents of FfxFsr2DispatchParamters each time a call +/// is made to ffxFsr2ContextDispatch. +/// +/// @ingroup FSR2 +typedef enum FfxFsr2Pass { + + FFX_FSR2_PASS_DEPTH_CLIP = 0, ///< A pass which performs depth clipping. + FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH = 1, ///< A pass which performs reconstruction of previous frame's depth. + FFX_FSR2_PASS_LOCK = 2, ///< A pass which calculates pixel locks. + FFX_FSR2_PASS_ACCUMULATE = 3, ///< A pass which performs upscaling. + FFX_FSR2_PASS_ACCUMULATE_SHARPEN = 4, ///< A pass which performs upscaling when sharpening is used. + FFX_FSR2_PASS_RCAS = 5, ///< A pass which performs sharpening. + FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID = 6, ///< A pass which generates the luminance mipmap chain for the current frame. + FFX_FSR2_PASS_GENERATE_REACTIVE = 7, ///< An optional pass to generate a reactive mask + FFX_FSR2_PASS_TCR_AUTOGENERATE = 8, ///< An optional pass to generate a texture-and-composition and reactive masks + + FFX_FSR2_PASS_COUNT ///< The number of passes performed by FSR2. +} FfxFsr2Pass; + +typedef enum FfxFsr2MsgType { + FFX_FSR2_MESSAGE_TYPE_ERROR = 0, + FFX_FSR2_MESSAGE_TYPE_WARNING = 1, + FFX_FSR2_MESSAGE_TYPE_COUNT +} FfxFsr2MsgType; + +/// Create and initialize the backend context. +/// +/// The callback function sets up the backend context for rendering. +/// It will create or reference the device and create required internal data structures. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] device The FfxDevice obtained by ffxGetDevice(DX12/VK/...). +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2CreateBackendContextFunc)( + FfxFsr2Interface* backendInterface, + FfxDevice device); + +/// Get a list of capabilities of the device. +/// +/// When creating an FfxFsr2Context it is desirable for the FSR2 +/// core implementation to be aware of certain characteristics of the platform +/// that is being targetted. This is because some optimizations which FSR2 +/// attempts to perform are more effective on certain classes of hardware than +/// others, or are not supported by older hardware. 
In order to avoid cases +/// where optimizations actually have the effect of decreasing performance, or +/// reduce the breadth of support provided by FSR2, FSR2 queries the +/// capabilities of the device to make such decisions. +/// +/// For target platforms with fixed hardware support you need not implement +/// this callback function by querying the device, but instead may hardcore +/// what features are available on the platform. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [out] outDeviceCapabilities The device capabilities structure to fill out. +/// @param [in] device The device to query for capabilities. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode(*FfxFsr2GetDeviceCapabilitiesFunc)( + FfxFsr2Interface* backendInterface, + FfxDeviceCapabilities* outDeviceCapabilities, + FfxDevice device); + +/// Destroy the backend context and dereference the device. +/// +/// This function is called when the FfxFsr2Context is destroyed. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode(*FfxFsr2DestroyBackendContextFunc)( + FfxFsr2Interface* backendInterface); + +/// Create a resource. +/// +/// This callback is intended for the backend to create internal resources. +/// +/// Please note: It is also possible that the creation of resources might +/// itself cause additional resources to be created by simply calling the +/// FfxFsr2CreateResourceFunc function pointer again. This is +/// useful when handling the initial creation of resources which must be +/// initialized. The flow in such a case would be an initial call to create the +/// CPU-side resource, another to create the GPU-side resource, and then a call +/// to schedule a copy render job to move the data between the two. Typically +/// this type of function call flow is only seen during the creation of an +/// FfxFsr2Context. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] createResourceDescription A pointer to a FfxCreateResourceDescription. +/// @param [out] outResource A pointer to a FfxResource object. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2CreateResourceFunc)( + FfxFsr2Interface* backendInterface, + const FfxCreateResourceDescription* createResourceDescription, + FfxResourceInternal* outResource); + +/// Register a resource in the backend for the current frame. +/// +/// Since FSR2 and the backend are not aware how many different +/// resources will get passed to FSR2 over time, it's not safe +/// to register all resources simultaneously in the backend. +/// Also passed resources may not be valid after the dispatch call. +/// As a result it's safest to register them as FfxResourceInternal +/// and clear them at the end of the dispatch call. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] inResource A pointer to a FfxResource. +/// @param [out] outResource A pointer to a FfxResourceInternal object. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. 
+/// +/// @ingroup FSR2 +typedef FfxErrorCode(*FfxFsr2RegisterResourceFunc)( + FfxFsr2Interface* backendInterface, + const FfxResource* inResource, + FfxResourceInternal* outResource); + +/// Unregister all temporary FfxResourceInternal from the backend. +/// +/// Unregister FfxResourceInternal referencing resources passed to +/// a function as a parameter. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode(*FfxFsr2UnregisterResourcesFunc)( + FfxFsr2Interface* backendInterface); + +/// Retrieve a FfxResourceDescription matching a +/// FfxResource structure. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] resource A pointer to a FfxResource object. +/// +/// @returns +/// A description of the resource. +/// +/// @ingroup FSR2 +typedef FfxResourceDescription (*FfxFsr2GetResourceDescriptionFunc)( + FfxFsr2Interface* backendInterface, + FfxResourceInternal resource); + +/// Destroy a resource +/// +/// This callback is intended for the backend to release an internal resource. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] resource A pointer to a FfxResource object. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2DestroyResourceFunc)( + FfxFsr2Interface* backendInterface, + FfxResourceInternal resource); + +/// Create a render pipeline. +/// +/// A rendering pipeline contains the shader as well as resource bindpoints +/// and samplers. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] pass The identifier for the pass. +/// @param [in] pipelineDescription A pointer to a FfxPipelineDescription describing the pipeline to be created. +/// @param [out] outPipeline A pointer to a FfxPipelineState structure which should be populated. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2CreatePipelineFunc)( + FfxFsr2Interface* backendInterface, + FfxFsr2Pass pass, + const FfxPipelineDescription* pipelineDescription, + FfxPipelineState* outPipeline); + +/// Destroy a render pipeline. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [out] pipeline A pointer to a FfxPipelineState structure which should be released. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2DestroyPipelineFunc)( + FfxFsr2Interface* backendInterface, + FfxPipelineState* pipeline); + +/// Schedule a render job to be executed on the next call of +/// FfxFsr2ExecuteGpuJobsFunc. +/// +/// Render jobs can perform one of three different tasks: clear, copy or +/// compute dispatches. +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] job A pointer to a FfxGpuJobDescription structure. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. 
+/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2ScheduleGpuJobFunc)( + FfxFsr2Interface* backendInterface, + const FfxGpuJobDescription* job); + +/// Execute scheduled render jobs on the comandList provided. +/// +/// The recording of the graphics API commands should take place in this +/// callback function, the render jobs which were previously enqueued (via +/// callbacks made to FfxFsr2ScheduleGpuJobFunc) should be +/// processed in the order they were received. Advanced users might choose to +/// reorder the rendering jobs, but should do so with care to respect the +/// resource dependencies. +/// +/// Depending on the precise contents of FfxFsr2DispatchDescription a +/// different number of render jobs might have previously been enqueued (for +/// example if sharpening is toggled on and off). +/// +/// @param [in] backendInterface A pointer to the backend interface. +/// @param [in] commandList A pointer to a FfxCommandList structure. +/// +/// @retval +/// FFX_OK The operation completed successfully. +/// @retval +/// Anything else The operation failed. +/// +/// @ingroup FSR2 +typedef FfxErrorCode (*FfxFsr2ExecuteGpuJobsFunc)( + FfxFsr2Interface* backendInterface, + FfxCommandList commandList); + +/// Pass a string message +/// +/// Used for debug messages. +/// +/// @param [in] type The type of message. +/// @param [in] message A string message to pass. +/// +/// +/// @ingroup FSR2 +typedef void(*FfxFsr2Message)( + FfxFsr2MsgType type, + const wchar_t* message); + +/// A structure encapsulating the interface between the core implentation of +/// the FSR2 algorithm and any graphics API that it should ultimately call. +/// +/// This set of functions serves as an abstraction layer between FSR2 and the +/// API used to implement it. While FSR2 ships with backends for DirectX12 and +/// Vulkan, it is possible to implement your own backend for other platforms or +/// which sits ontop of your engine's own abstraction layer. For details on the +/// expectations of what each function should do you should refer the +/// description of the following function pointer types: +/// +/// FfxFsr2CreateDeviceFunc +/// FfxFsr2GetDeviceCapabilitiesFunc +/// FfxFsr2DestroyDeviceFunc +/// FfxFsr2CreateResourceFunc +/// FfxFsr2GetResourceDescriptionFunc +/// FfxFsr2DestroyResourceFunc +/// FfxFsr2CreatePipelineFunc +/// FfxFsr2DestroyPipelineFunc +/// FfxFsr2ScheduleGpuJobFunc +/// FfxFsr2ExecuteGpuJobsFunc +/// +/// Depending on the graphics API that is abstracted by the backend, it may be +/// required that the backend is to some extent stateful. To ensure that +/// applications retain full control to manage the memory used by FSR2, the +/// scratchBuffer and scratchBufferSize fields are +/// provided. A backend should provide a means of specifying how much scratch +/// memory is required for its internal implementation (e.g: via a function +/// or constant value). The application is that responsible for allocating that +/// memory and providing it when setting up the FSR2 backend. Backends provided +/// with FSR2 do not perform dynamic memory allocations, and instead +/// suballocate all memory from the scratch buffers provided. +/// +/// The scratchBuffer and scratchBufferSize fields +/// should be populated according to the requirements of each backend. For +/// example, if using the DirectX 12 backend you should call the +/// ffxFsr2GetScratchMemorySizeDX12 function. It is not required +/// that custom backend implementations use a scratch buffer. 
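A minimal C++ sketch of how an application might populate this interface for a custom backend; the my_backend_* callbacks, the my_backend_scratch_size() sizing helper and the use of malloc are assumptions for illustration only, not part of the SDK:

    #include <cstdlib>

    FfxFsr2Interface backend_interface = {};
    backend_interface.fpCreateBackendContext = my_backend_create_context;    // hypothetical callbacks implemented by
    backend_interface.fpGetDeviceCapabilities = my_backend_get_capabilities; // the engine against its own graphics
    backend_interface.fpDestroyBackendContext = my_backend_destroy_context;  // abstraction
    backend_interface.fpCreateResource = my_backend_create_resource;
    backend_interface.fpRegisterResource = my_backend_register_resource;
    backend_interface.fpUnregisterResources = my_backend_unregister_resources;
    backend_interface.fpGetResourceDescription = my_backend_get_resource_description;
    backend_interface.fpDestroyResource = my_backend_destroy_resource;
    backend_interface.fpCreatePipeline = my_backend_create_pipeline;
    backend_interface.fpDestroyPipeline = my_backend_destroy_pipeline;
    backend_interface.fpScheduleGpuJob = my_backend_schedule_gpu_job;
    backend_interface.fpExecuteGpuJobs = my_backend_execute_gpu_jobs;

    // Scratch memory is optional for custom backends; when it is used, the
    // application owns the allocation and hands it to the interface.
    backend_interface.scratchBufferSize = my_backend_scratch_size(); // hypothetical sizing helper
    backend_interface.scratchBuffer = backend_interface.scratchBufferSize ? malloc(backend_interface.scratchBufferSize) : nullptr;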
+///
+/// @ingroup FSR2
+typedef struct FfxFsr2Interface {
+
+ FfxFsr2CreateBackendContextFunc fpCreateBackendContext; ///< A callback function to create and initialize the backend context.
+ FfxFsr2GetDeviceCapabilitiesFunc fpGetDeviceCapabilities; ///< A callback function to query device capabilities.
+ FfxFsr2DestroyBackendContextFunc fpDestroyBackendContext; ///< A callback function to destroy the backend context. This also dereferences the device.
+ FfxFsr2CreateResourceFunc fpCreateResource; ///< A callback function to create a resource.
+ FfxFsr2RegisterResourceFunc fpRegisterResource; ///< A callback function to register an external resource.
+ FfxFsr2UnregisterResourcesFunc fpUnregisterResources; ///< A callback function to unregister external resources.
+ FfxFsr2GetResourceDescriptionFunc fpGetResourceDescription; ///< A callback function to retrieve a resource description.
+ FfxFsr2DestroyResourceFunc fpDestroyResource; ///< A callback function to destroy a resource.
+ FfxFsr2CreatePipelineFunc fpCreatePipeline; ///< A callback function to create a render or compute pipeline.
+ FfxFsr2DestroyPipelineFunc fpDestroyPipeline; ///< A callback function to destroy a render or compute pipeline.
+ FfxFsr2ScheduleGpuJobFunc fpScheduleGpuJob; ///< A callback function to schedule a render job.
+ FfxFsr2ExecuteGpuJobsFunc fpExecuteGpuJobs; ///< A callback function to execute all queued render jobs.
+
+ void* scratchBuffer; ///< A preallocated buffer for memory utilized internally by the backend.
+ size_t scratchBufferSize; ///< Size of the buffer pointed to by scratchBuffer.
+} FfxFsr2Interface;
+
+#if defined(__cplusplus)
+}
+#endif // #if defined(__cplusplus)
diff --git a/thirdparty/amd-fsr2/ffx_fsr2_maximum_bias.h b/thirdparty/amd-fsr2/ffx_fsr2_maximum_bias.h
new file mode 100644
index 00000000000..5fdbd0cdcd0
--- /dev/null
+++ b/thirdparty/amd-fsr2/ffx_fsr2_maximum_bias.h
@@ -0,0 +1,46 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
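The FfxFsr2Interface contract documented above is easiest to see as a short wiring sketch. The snippet below is illustrative only: myScheduleGpuJob, myExecuteGpuJobs and MY_SCRATCH_SIZE are hypothetical engine-side names, not symbols added by this patch. It fills two of the callback slots whose typedefs appear above and hands FSR2 an application-owned scratch buffer, in the same spirit as the RenderingDevice backend added in servers/rendering/renderer_rd/effects/fsr2.cpp.

// Hypothetical engine-side backend wiring; assumes the amd-fsr2 headers are on the include path.
#include <cstdlib>
#include "ffx_types.h"
#include "ffx_error.h"
#include "ffx_fsr2_interface.h"

static FfxErrorCode myScheduleGpuJob(FfxFsr2Interface *backendInterface, const FfxGpuJobDescription *job) {
    // A real backend records the clear/copy/compute job into its own queue here.
    return FFX_OK;
}

static FfxErrorCode myExecuteGpuJobs(FfxFsr2Interface *backendInterface, FfxCommandList commandList) {
    // A real backend replays every queued job as API commands on commandList, in submission order.
    return FFX_OK;
}

static void setup_backend(FfxFsr2Interface &backendInterface) {
    const size_t MY_SCRATCH_SIZE = 16 * 1024 * 1024; // Hypothetical size; each backend reports its own requirement.

    backendInterface.fpScheduleGpuJob = myScheduleGpuJob;
    backendInterface.fpExecuteGpuJobs = myExecuteGpuJobs;
    // The remaining fp* members are filled with the engine's implementations in the same way.

    // Backends shipped with FSR2 do not allocate dynamically; everything is suballocated from this buffer.
    backendInterface.scratchBuffer = calloc(1, MY_SCRATCH_SIZE);
    backendInterface.scratchBufferSize = MY_SCRATCH_SIZE;
}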
+ +// @internal + +#pragma once + +static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH = 16; +static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT = 16; +static const float ffxFsr2MaximumBias[] = { + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.876f, 1.809f, 1.772f, 1.753f, 1.748f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.869f, 1.801f, 1.764f, 1.745f, 1.739f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.976f, 1.841f, 1.774f, 1.737f, 1.716f, 1.71f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.914f, 1.784f, 1.716f, 1.673f, 1.649f, 1.641f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.793f, 1.676f, 1.604f, 1.562f, 1.54f, 1.533f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.802f, 1.619f, 1.536f, 1.492f, 1.467f, 1.454f, 1.449f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.812f, 1.575f, 1.496f, 1.456f, 1.432f, 1.416f, 1.408f, 1.405f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.555f, 1.479f, 1.438f, 1.413f, 1.398f, 1.387f, 1.381f, 1.379f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.812f, 1.555f, 1.474f, 1.43f, 1.404f, 1.387f, 1.376f, 1.368f, 1.363f, 1.362f, + 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.802f, 1.575f, 1.479f, 1.43f, 1.401f, 1.382f, 1.369f, 1.36f, 1.354f, 1.351f, 1.35f, + 2.0f, 2.0f, 1.976f, 1.914f, 1.793f, 1.619f, 1.496f, 1.438f, 1.404f, 1.382f, 1.367f, 1.357f, 1.349f, 1.344f, 1.341f, 1.34f, + 1.876f, 1.869f, 1.841f, 1.784f, 1.676f, 1.536f, 1.456f, 1.413f, 1.387f, 1.369f, 1.357f, 1.347f, 1.341f, 1.336f, 1.333f, 1.332f, + 1.809f, 1.801f, 1.774f, 1.716f, 1.604f, 1.492f, 1.432f, 1.398f, 1.376f, 1.36f, 1.349f, 1.341f, 1.335f, 1.33f, 1.328f, 1.327f, + 1.772f, 1.764f, 1.737f, 1.673f, 1.562f, 1.467f, 1.416f, 1.387f, 1.368f, 1.354f, 1.344f, 1.336f, 1.33f, 1.326f, 1.323f, 1.323f, + 1.753f, 1.745f, 1.716f, 1.649f, 1.54f, 1.454f, 1.408f, 1.381f, 1.363f, 1.351f, 1.341f, 1.333f, 1.328f, 1.323f, 1.321f, 1.32f, + 1.748f, 1.739f, 1.71f, 1.641f, 1.533f, 1.449f, 1.405f, 1.379f, 1.362f, 1.35f, 1.34f, 1.332f, 1.327f, 1.323f, 1.32f, 1.319f, + +}; diff --git a/thirdparty/amd-fsr2/ffx_fsr2_private.h b/thirdparty/amd-fsr2/ffx_fsr2_private.h new file mode 100644 index 00000000000..0face069b60 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_fsr2_private.h @@ -0,0 +1,86 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
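The ffxFsr2MaximumBias table above is a flattened 16x16 grid stored row by row, so texel (x, y) lives at index y * FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH + x. Below is a minimal sketch of that indexing on the CPU, assuming nothing about how the FSR2 passes sample the data on the GPU; maximum_bias_at is a hypothetical helper, not part of the patch.

// Hypothetical layout check for the 16x16 maximum-bias table.
#include <cassert>
#include "ffx_fsr2_maximum_bias.h"

static float maximum_bias_at(int x, int y) {
    assert(x >= 0 && x < FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH);
    assert(y >= 0 && y < FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT);
    return ffxFsr2MaximumBias[y * FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH + x];
}

int main() {
    assert(maximum_bias_at(0, 0) == 2.0f);     // First entry of the first row above.
    assert(maximum_bias_at(15, 15) == 1.319f); // Last entry of the last row above.
    return 0;
}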
+ +#pragma once + +// Constants for FSR2 DX12 dispatches. Must be kept in sync with cbFSR2 in ffx_fsr2_callbacks_hlsl.h +typedef struct Fsr2Constants { + + int32_t renderSize[2]; + int32_t maxRenderSize[2]; + int32_t displaySize[2]; + int32_t inputColorResourceDimensions[2]; + int32_t lumaMipDimensions[2]; + int32_t lumaMipLevelToUse; + int32_t frameIndex; + + float deviceToViewDepth[4]; + float jitterOffset[2]; + float motionVectorScale[2]; + float downscaleFactor[2]; + float motionVectorJitterCancellation[2]; + float preExposure; + float previousFramePreExposure; + float tanHalfFOV; + float jitterPhaseCount; + float deltaTime; + float dynamicResChangeFactor; + float viewSpaceToMetersFactor; + + // -- GODOT start -- + float pad; + float reprojectionMatrix[16]; + // -- GODOT end -- +} Fsr2Constants; + +struct FfxFsr2ContextDescription; +struct FfxDeviceCapabilities; +struct FfxPipelineState; +struct FfxResource; + +// FfxFsr2Context_Private +// The private implementation of the FSR2 context. +typedef struct FfxFsr2Context_Private { + + FfxFsr2ContextDescription contextDescription; + Fsr2Constants constants; + FfxDevice device; + FfxDeviceCapabilities deviceCapabilities; + FfxPipelineState pipelineDepthClip; + FfxPipelineState pipelineReconstructPreviousDepth; + FfxPipelineState pipelineLock; + FfxPipelineState pipelineAccumulate; + FfxPipelineState pipelineAccumulateSharpen; + FfxPipelineState pipelineRCAS; + FfxPipelineState pipelineComputeLuminancePyramid; + FfxPipelineState pipelineGenerateReactive; + FfxPipelineState pipelineTcrAutogenerate; + + // 2 arrays of resources, as e.g. FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS will use different resources when bound as SRV vs when bound as UAV + FfxResourceInternal srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_COUNT]; + FfxResourceInternal uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_COUNT]; + + bool firstExecution; + bool refreshPipelineStates; + uint32_t resourceFrameIndex; + float previousJitterOffset[2]; + int32_t jitterPhaseCountRemaining; +} FfxFsr2Context_Private; diff --git a/thirdparty/amd-fsr2/ffx_types.h b/thirdparty/amd-fsr2/ffx_types.h new file mode 100644 index 00000000000..8b65219b50e --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_types.h @@ -0,0 +1,367 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
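Because Fsr2Constants above has to stay bit-for-bit in sync with the cbFSR2 constant buffer on the shader side, its layout can be sanity-checked at compile time. The figures below are derived only from the member list above plus the usual 16-byte constant-buffer packing rules, and the check itself is a hypothetical addition rather than something this patch contains; it assumes a translation unit where Fsr2Constants is already visible (for example alongside its use in ffx_fsr2.cpp).

// Hypothetical compile-time layout check for Fsr2Constants.
#include <cstddef>

// 12 int32 fields (48 bytes) + 19 float fields (76 bytes) + the Godot-added pad and
// reprojectionMatrix[16] (68 bytes) add up to 192 bytes, i.e. 48 uint32 constant slots.
static_assert(sizeof(Fsr2Constants) == 192, "Fsr2Constants is expected to span 48 uint32 slots (192 bytes).");

// The Godot-added 'pad' keeps the reprojection matrix on a 16-byte boundary, the
// alignment a float4x4 gets inside a constant buffer.
static_assert(offsetof(Fsr2Constants, reprojectionMatrix) % 16 == 0, "reprojectionMatrix should be 16-byte aligned.");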
+ +#pragma once + +#include +// -- GODOT start -- +#include +// -- GODOT end -- + +#if defined (FFX_GCC) +/// FidelityFX exported functions +#define FFX_API +#else +/// FidelityFX exported functions +#define FFX_API __declspec(dllexport) +#endif // #if defined (FFX_GCC) + +/// Maximum supported number of simultaneously bound SRVs. +#define FFX_MAX_NUM_SRVS 16 + +/// Maximum supported number of simultaneously bound UAVs. +#define FFX_MAX_NUM_UAVS 8 + +/// Maximum number of constant buffers bound. +#define FFX_MAX_NUM_CONST_BUFFERS 2 + +/// Maximum size of bound constant buffers. +#define FFX_MAX_CONST_SIZE 64 + +/// Off by default warnings +#if defined(_MSC_VER) +#pragma warning(disable : 4365 4710 4820 5039) +#elif defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wmissing-field-initializers" +#pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wignored-qualifiers" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#ifdef __cplusplus +extern "C" { +#endif // #ifdef __cplusplus + +/// An enumeration of surface formats. +typedef enum FfxSurfaceFormat { + + FFX_SURFACE_FORMAT_UNKNOWN, ///< Unknown format + FFX_SURFACE_FORMAT_R32G32B32A32_TYPELESS, ///< 32 bit per channel, 4 channel typeless format + FFX_SURFACE_FORMAT_R32G32B32A32_FLOAT, ///< 32 bit per channel, 4 channel float format + FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT, ///< 16 bit per channel, 4 channel float format + FFX_SURFACE_FORMAT_R16G16B16A16_UNORM, ///< 16 bit per channel, 4 channel unsigned normalized format + FFX_SURFACE_FORMAT_R32G32_FLOAT, ///< 32 bit per channel, 2 channel float format + FFX_SURFACE_FORMAT_R32_UINT, ///< 32 bit per channel, 1 channel float format + FFX_SURFACE_FORMAT_R8G8B8A8_TYPELESS, ///< 8 bit per channel, 4 channel float format + FFX_SURFACE_FORMAT_R8G8B8A8_UNORM, ///< 8 bit per channel, 4 channel unsigned normalized format + FFX_SURFACE_FORMAT_R11G11B10_FLOAT, ///< 32 bit 3 channel float format + FFX_SURFACE_FORMAT_R16G16_FLOAT, ///< 16 bit per channel, 2 channel float format + FFX_SURFACE_FORMAT_R16G16_UINT, ///< 16 bit per channel, 2 channel unsigned int format + FFX_SURFACE_FORMAT_R16_FLOAT, ///< 16 bit per channel, 1 channel float format + FFX_SURFACE_FORMAT_R16_UINT, ///< 16 bit per channel, 1 channel unsigned int format + FFX_SURFACE_FORMAT_R16_UNORM, ///< 16 bit per channel, 1 channel unsigned normalized format + FFX_SURFACE_FORMAT_R16_SNORM, ///< 16 bit per channel, 1 channel signed normalized format + FFX_SURFACE_FORMAT_R8_UNORM, ///< 8 bit per channel, 1 channel unsigned normalized format + FFX_SURFACE_FORMAT_R8_UINT, ///< 8 bit per channel, 1 channel unsigned int format + FFX_SURFACE_FORMAT_R8G8_UNORM, ///< 8 bit per channel, 2 channel unsigned normalized format + FFX_SURFACE_FORMAT_R32_FLOAT ///< 32 bit per channel, 1 channel float format +} FfxSurfaceFormat; + +/// An enumeration of resource usage. +typedef enum FfxResourceUsage { + + FFX_RESOURCE_USAGE_READ_ONLY = 0, ///< No usage flags indicate a resource is read only. + FFX_RESOURCE_USAGE_RENDERTARGET = (1<<0), ///< Indicates a resource will be used as render target. + FFX_RESOURCE_USAGE_UAV = (1<<1), ///< Indicates a resource will be used as UAV. +} FfxResourceUsage; + +/// An enumeration of resource states. +typedef enum FfxResourceStates { + + FFX_RESOURCE_STATE_UNORDERED_ACCESS = (1<<0), ///< Indicates a resource is in the state to be used as UAV. 
+ FFX_RESOURCE_STATE_COMPUTE_READ = (1 << 1), ///< Indicates a resource is in the state to be read by compute shaders. + FFX_RESOURCE_STATE_COPY_SRC = (1 << 2), ///< Indicates a resource is in the state to be used as source in a copy command. + FFX_RESOURCE_STATE_COPY_DEST = (1 << 3), ///< Indicates a resource is in the state to be used as destination in a copy command. + FFX_RESOURCE_STATE_GENERIC_READ = (FFX_RESOURCE_STATE_COPY_SRC | FFX_RESOURCE_STATE_COMPUTE_READ), ///< Indicates a resource is in generic (slow) read state. +} FfxResourceStates; + +/// An enumeration of surface dimensions. +typedef enum FfxResourceDimension { + + FFX_RESOURCE_DIMENSION_TEXTURE_1D, ///< A resource with a single dimension. + FFX_RESOURCE_DIMENSION_TEXTURE_2D, ///< A resource with two dimensions. +} FfxResourceDimension; + +/// An enumeration of surface dimensions. +typedef enum FfxResourceFlags { + + FFX_RESOURCE_FLAGS_NONE = 0, ///< No flags. + FFX_RESOURCE_FLAGS_ALIASABLE = (1<<0), ///< A bit indicating a resource does not need to persist across frames. +} FfxResourceFlags; + +/// An enumeration of all resource view types. +typedef enum FfxResourceViewType { + + FFX_RESOURCE_VIEW_UNORDERED_ACCESS, ///< The resource view is an unordered access view (UAV). + FFX_RESOURCE_VIEW_SHADER_READ, ///< The resource view is a shader resource view (SRV). +} FfxResourceViewType; + +/// The type of filtering to perform when reading a texture. +typedef enum FfxFilterType { + + FFX_FILTER_TYPE_POINT, ///< Point sampling. + FFX_FILTER_TYPE_LINEAR ///< Sampling with interpolation. +} FfxFilterType; + +/// An enumeration of all supported shader models. +typedef enum FfxShaderModel { + + FFX_SHADER_MODEL_5_1, ///< Shader model 5.1. + FFX_SHADER_MODEL_6_0, ///< Shader model 6.0. + FFX_SHADER_MODEL_6_1, ///< Shader model 6.1. + FFX_SHADER_MODEL_6_2, ///< Shader model 6.2. + FFX_SHADER_MODEL_6_3, ///< Shader model 6.3. + FFX_SHADER_MODEL_6_4, ///< Shader model 6.4. + FFX_SHADER_MODEL_6_5, ///< Shader model 6.5. + FFX_SHADER_MODEL_6_6, ///< Shader model 6.6. + FFX_SHADER_MODEL_6_7, ///< Shader model 6.7. +} FfxShaderModel; + +// An enumeration for different resource types +typedef enum FfxResourceType { + + FFX_RESOURCE_TYPE_BUFFER, ///< The resource is a buffer. + FFX_RESOURCE_TYPE_TEXTURE1D, ///< The resource is a 1-dimensional texture. + FFX_RESOURCE_TYPE_TEXTURE2D, ///< The resource is a 2-dimensional texture. + FFX_RESOURCE_TYPE_TEXTURE3D, ///< The resource is a 3-dimensional texture. +} FfxResourceType; + +/// An enumeration for different heap types +typedef enum FfxHeapType { + + FFX_HEAP_TYPE_DEFAULT = 0, ///< Local memory. + FFX_HEAP_TYPE_UPLOAD ///< Heap used for uploading resources. +} FfxHeapType; + +/// An enumberation for different render job types +typedef enum FfxGpuJobType { + + FFX_GPU_JOB_CLEAR_FLOAT = 0, ///< The GPU job is performing a floating-point clear. + FFX_GPU_JOB_COPY = 1, ///< The GPU job is performing a copy. + FFX_GPU_JOB_COMPUTE = 2, ///< The GPU job is performing a compute dispatch. +} FfxGpuJobType; + +/// A typedef representing the graphics device. +typedef void* FfxDevice; + +/// A typedef representing a command list or command buffer. +typedef void* FfxCommandList; + +/// A typedef for a root signature. +typedef void* FfxRootSignature; + +/// A typedef for a pipeline state object. +typedef void* FfxPipeline; + +/// A structure encapasulating a collection of device capabilities. 
+typedef struct FfxDeviceCapabilities { + + FfxShaderModel minimumSupportedShaderModel; ///< The minimum shader model supported by the device. + uint32_t waveLaneCountMin; ///< The minimum supported wavefront width. + uint32_t waveLaneCountMax; ///< The maximum supported wavefront width. + bool fp16Supported; ///< The device supports FP16 in hardware. + bool raytracingSupported; ///< The device supports raytracing. +} FfxDeviceCapabilities; + +/// A structure encapsulating a 2-dimensional point, using 32bit unsigned integers. +typedef struct FfxDimensions2D { + + uint32_t width; ///< The width of a 2-dimensional range. + uint32_t height; ///< The height of a 2-dimensional range. +} FfxDimensions2D; + +/// A structure encapsulating a 2-dimensional point, +typedef struct FfxIntCoords2D { + + int32_t x; ///< The x coordinate of a 2-dimensional point. + int32_t y; ///< The y coordinate of a 2-dimensional point. +} FfxIntCoords2D; + +/// A structure encapsulating a 2-dimensional set of floating point coordinates. +typedef struct FfxFloatCoords2D { + + float x; ///< The x coordinate of a 2-dimensional point. + float y; ///< The y coordinate of a 2-dimensional point. +} FfxFloatCoords2D; + +/// A structure describing a resource. +typedef struct FfxResourceDescription { + + FfxResourceType type; ///< The type of the resource. + FfxSurfaceFormat format; ///< The surface format. + uint32_t width; ///< The width of the resource. + uint32_t height; ///< The height of the resource. + uint32_t depth; ///< The depth of the resource. + uint32_t mipCount; ///< Number of mips (or 0 for full mipchain). + FfxResourceFlags flags; ///< A set of FfxResourceFlags flags. +} FfxResourceDescription; + +/// An outward facing structure containing a resource +typedef struct FfxResource { + void* resource; ///< pointer to the resource. + wchar_t name[64]; + FfxResourceDescription description; + FfxResourceStates state; + bool isDepth; + uint64_t descriptorData; +} FfxResource; + +/// An internal structure containing a handle to a resource and resource views +typedef struct FfxResourceInternal { + int32_t internalIndex; ///< The index of the resource. +} FfxResourceInternal; + + +/// A structure defining a resource bind point +typedef struct FfxResourceBinding +{ + uint32_t slotIndex; + uint32_t resourceIdentifier; + wchar_t name[64]; +}FfxResourceBinding; + +/// A structure encapsulating a single pass of an algorithm. +typedef struct FfxPipelineState { + + FfxRootSignature rootSignature; ///< The pipelines rootSignature + FfxPipeline pipeline; ///< The pipeline object + uint32_t uavCount; ///< Count of UAVs used in this pipeline + uint32_t srvCount; ///< Count of SRVs used in this pipeline + uint32_t constCount; ///< Count of constant buffers used in this pipeline + + FfxResourceBinding uavResourceBindings[FFX_MAX_NUM_UAVS]; ///< Array of ResourceIdentifiers bound as UAVs + FfxResourceBinding srvResourceBindings[FFX_MAX_NUM_SRVS]; ///< Array of ResourceIdentifiers bound as SRVs + FfxResourceBinding cbResourceBindings[FFX_MAX_NUM_CONST_BUFFERS]; ///< Array of ResourceIdentifiers bound as CBs +} FfxPipelineState; + +/// A structure containing the data required to create a resource. +typedef struct FfxCreateResourceDescription { + + FfxHeapType heapType; ///< The heap type to hold the resource, typically FFX_HEAP_TYPE_DEFAULT. + FfxResourceDescription resourceDescription; ///< A resource description. + FfxResourceStates initalState; ///< The initial resource state. + uint32_t initDataSize; ///< Size of initial data buffer. 
+ void* initData; ///< Buffer containing data to fill the resource. + const wchar_t* name; ///< Name of the resource. + FfxResourceUsage usage; ///< Resource usage flags. + uint32_t id; ///< Internal resource ID. +} FfxCreateResourceDescription; + +/// A structure containing the description used to create a +/// FfxPipeline structure. +/// +/// A pipeline is the name given to a shader and the collection of state that +/// is required to dispatch it. In the context of FSR2 and its architecture +/// this means that a FfxPipelineDescription will map to either a +/// monolithic object in an explicit API (such as a +/// PipelineStateObject in DirectX 12). Or a shader and some +/// ancillary API objects (in something like DirectX 11). +/// +/// The contextFlags field contains a copy of the flags passed +/// to ffxFsr2ContextCreate via the flags field of +/// the FfxFsr2InitializationParams structure. These flags are +/// used to determine which permutation of a pipeline for a specific +/// FfxFsr2Pass should be used to implement the features required +/// by each application, as well as to acheive the best performance on specific +/// target hardware configurations. +/// +/// When using one of the provided backends for FSR2 (such as DirectX 12 or +/// Vulkan) the data required to create a pipeline is compiled offline and +/// included into the backend library that you are using. For cases where the +/// backend interface is overriden by providing custom callback function +/// implementations care should be taken to respect the contents of the +/// contextFlags field in order to correctly support the options +/// provided by FSR2, and acheive best performance. +/// +/// @ingroup FSR2 +typedef struct FfxPipelineDescription { + + uint32_t contextFlags; ///< A collection of FfxFsr2InitializationFlagBits which were passed to the context. + FfxFilterType* samplers; ///< Array of static samplers. + size_t samplerCount; ///< The number of samples contained inside samplers. + const uint32_t* rootConstantBufferSizes; ///< Array containing the sizes of the root constant buffers (count of 32 bit elements). + uint32_t rootConstantBufferCount; ///< The number of root constants contained within rootConstantBufferSizes. +} FfxPipelineDescription; + +/// A structure containing a constant buffer. +typedef struct FfxConstantBuffer { + + uint32_t uint32Size; ///< Size of 32 bit chunks used in the constant buffer + uint32_t data[FFX_MAX_CONST_SIZE]; ///< Constant buffer data +}FfxConstantBuffer; + +/// A structure describing a clear render job. +typedef struct FfxClearFloatJobDescription { + + float color[4]; ///< The clear color of the resource. + FfxResourceInternal target; ///< The resource to be cleared. +} FfxClearFloatJobDescription; + +/// A structure describing a compute render job. +typedef struct FfxComputeJobDescription { + + FfxPipelineState pipeline; ///< Compute pipeline for the render job. + uint32_t dimensions[3]; ///< Dispatch dimensions. + FfxResourceInternal srvs[FFX_MAX_NUM_SRVS]; ///< SRV resources to be bound in the compute job. + wchar_t srvNames[FFX_MAX_NUM_SRVS][64]; + FfxResourceInternal uavs[FFX_MAX_NUM_UAVS]; ///< UAV resources to be bound in the compute job. + uint32_t uavMip[FFX_MAX_NUM_UAVS]; ///< Mip level of UAV resources to be bound in the compute job. + wchar_t uavNames[FFX_MAX_NUM_UAVS][64]; + FfxConstantBuffer cbs[FFX_MAX_NUM_CONST_BUFFERS]; ///< Constant buffers to be bound in the compute job. 
+ wchar_t cbNames[FFX_MAX_NUM_CONST_BUFFERS][64]; + uint32_t cbSlotIndex[FFX_MAX_NUM_CONST_BUFFERS]; ///< Slot index in the descriptor table +} FfxComputeJobDescription; + +/// A structure describing a copy render job. +typedef struct FfxCopyJobDescription +{ + FfxResourceInternal src; ///< Source resource for the copy. + FfxResourceInternal dst; ///< Destination resource for the copy. +} FfxCopyJobDescription; + +/// A structure describing a single render job. +typedef struct FfxGpuJobDescription{ + + FfxGpuJobType jobType; ///< Type of the job. + + union { + FfxClearFloatJobDescription clearJobDescriptor; ///< Clear job descriptor. Valid when jobType is FFX_RENDER_JOB_CLEAR_FLOAT. + FfxCopyJobDescription copyJobDescriptor; ///< Copy job descriptor. Valid when jobType is FFX_RENDER_JOB_COPY. + FfxComputeJobDescription computeJobDescriptor; ///< Compute job descriptor. Valid when jobType is FFX_RENDER_JOB_COMPUTE. + }; +} FfxGpuJobDescription; + +#ifdef __cplusplus +} +#endif // #ifdef __cplusplus diff --git a/thirdparty/amd-fsr2/ffx_util.h b/thirdparty/amd-fsr2/ffx_util.h new file mode 100644 index 00000000000..ca4324ea832 --- /dev/null +++ b/thirdparty/amd-fsr2/ffx_util.h @@ -0,0 +1,78 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "ffx_types.h" + +/// The value of Pi. +const float FFX_PI = 3.141592653589793f; + +/// An epsilon value for floating point numbers. +const float FFX_EPSILON = 1e-06f; + +/// Helper macro to create the version number. +#define FFX_MAKE_VERSION(major, minor, patch) ((major << 22) | (minor << 12) | patch) + +///< Use this to specify no version. +#define FFX_UNSPECIFIED_VERSION 0xFFFFAD00 + +/// Helper macro to avoid warnings about unused variables. +#define FFX_UNUSED(x) ((void)(x)) + +/// Helper macro to align an integer to the specified power of 2 boundary +#define FFX_ALIGN_UP(x, y) (((x) + ((y)-1)) & ~((y)-1)) + +/// Helper macro to check if a value is aligned. +#define FFX_IS_ALIGNED(x) (((x) != 0) && ((x) & ((x)-1))) + +/// Helper macro to stringify a value. +#define FFX_STR(s) FFX_XSTR(s) +#define FFX_XSTR(s) #s + +/// Helper macro to forward declare a structure. +#define FFX_FORWARD_DECLARE(x) typedef struct x x + +/// Helper macro to return the maximum of two values. +#define FFX_MAXIMUM(x, y) (((x) > (y)) ? (x) : (y)) + +/// Helper macro to return the minimum of two values. 
+#define FFX_MINIMUM(x, y) (((x) < (y)) ? (x) : (y)) + +/// Helper macro to do safe free on a pointer. +#define FFX_SAFE_FREE(x) \ + if (x) \ + free(x) + +/// Helper macro to return the abs of an integer value. +#define FFX_ABSOLUTE(x) (((x) < 0) ? (-(x)) : (x)) + +/// Helper macro to return sign of a value. +#define FFX_SIGN(x) (((x) < 0) ? -1 : 1) + +/// Helper macro to work out the number of elements in an array. +#define FFX_ARRAY_ELEMENTS(x) (int32_t)((sizeof(x) / sizeof(0 [x])) / ((size_t)(!(sizeof(x) % sizeof(0 [x]))))) + +/// The maximum length of a path that can be specified to the FidelityFX API. +#define FFX_MAXIMUM_PATH (260) + +/// Helper macro to check if the specified key is set in a bitfield. +#define FFX_CONTAINS_FLAG(options, key) ((options & key) == key) diff --git a/thirdparty/amd-fsr2/patches/godot-changes.patch b/thirdparty/amd-fsr2/patches/godot-changes.patch new file mode 100644 index 0000000000000000000000000000000000000000..513d8a5a29530ba70954a7e4ad2f7b16da1e6845 GIT binary patch literal 21350 zcmeI4ZExGi5y$tf3-mh>cW-oc?D+Yi*B0m6ahf(xTwBStJrshhhuA&aab-JodbqFN z_W#?_iWDW9q)Z7toIpq-m&@Im*_r>&%r5oc|FqqO>*|Qyl8z6$y3={nJ=gEKu6nNR zZgkz#^=Fkf-IYp~^u(>o)>OjN53VsNS#~WwaidnwDrn`G?iZrsT77oZOXODErd!qb zs@qY?hFf>rZdu2U+tO86S6$s(bGy2Ve9aYqXT#AFJT6tlAMGvrqdHE%ao>o#C2_Uo zp6L^*eLG(jXOZrHGlzmp#QU2S~jd7|fGE*A&oX}XWxsoNL*@7)V`lBjj<$Kl>{5V_yDf9d+% z)4QzS-wkRGb4aaQeQv~2LuZkHE|dh>rD@BOz12&~-A2snqb=CcKPC5$|DX$9uS-`d zq4%!W67-1cme+_IKSn*>rG;yKKIkq_FnZR5je5{G#`apzgD3ZXH1DL#YvPcSLrL(& zUmdwU_ZOA^*}aoA`5fr#P zZ47iBioXZ7+tYzH?CTp&;oC6nK-Au9G@7E6@vDg%o;mE_=Lxt5k?ut6v6m3GL)ov? zlJOpDJR6jlG=^I{=<^h*7RT_jOVS!EG3nCpt?=$rGG!!=MC*+tcF`XTblv1`+HJB+ z%fD8z7t@v#Kg!6cr_nH53$0^cR|6V8$(F(xRnO0=taW<)SnJvRm>n(%@ycLckOG!+ zC8=ZMMWqi?gYqBUKXlg8$lKT+c})YSAEfD#*HrNEN}7Sb6n$sG>pm5AP49VrU8p@U;<@~+NWMbky;tT%P=eOs zIkZQ~xxbT*D8o2XHE*EygOH@Cobq@xkN-+zT%0$X!vY0{uoe;Jd+*_?N#%mv^JcT} zWTRV}#rq&hU?~ zq)lP=MbB(DX4c@jGg+GT%X36vLD#T7Jm+)aPM-geH6Q+{_9n5`;D?!3o7Busn70BF z-0KW`p?7TMMjYLXZzP)L#_|C886Dll8%#zE!KQ6AEaoIeAd15~iRDL}Mvfu!eJAZd zp539k`kAKTHSofZEgTS)J#FS0*CMnhXQC}>$fkHclZ44$5X&1 z_!q2<%)z9HH{3=VrLMe3-LBEvG$@ zXuuf}pXCEXUKX#2yx5`7-Mtj9?f3Jj#Fj?;p?WrS#jpD8>*8rV2%Eq>Y{0UQQ!o{sW);L@#*WuY{l;iY&j6P;GqZ`#VX`gYTm z+*YKI*c?{Y*7>@(sIE$|y_RUM!pf@UWfIG*mV#i-pwTn=VXUyk=dm@L2ESE4x~@8J zrMrBe==+s>qwjrPz0vVPXLXfv6)BVSN>_i-8Q(ATd+ML9>&~8@ro>ta_ur{TT*~SR z&(Y6;e)rsK-``8svYrh`jJnW9x(|yr%zBr!szJPh1%xPm!=o@Q*~}l?xn;D$fw=Zd z(a`X#J8$*VEv*__1q9e~xu`-(Rt^5xkrtzv~11El$B}>DRvdlc%Y!UhAR&-I?qNtEOdTaO~}- z;dS`f>vTg@q07f5DcPJr2FKpgj>SC|$M`TVfhH^$DPdzpj;#qOcrE@}*Rq~ZC4oJM zwjhDJ`)i?oW7@VSqF_%v@4LT=(^Js_MIhN^$Bw5U*bv@a@PvqGS#4WW%dr&&OV8Ka zeaVH@K3ff(jJnaM*d1Ur7|I!57R_|oIyIJV-c6vcAz7l$hW3Vuw&HD?exf_92-~Q? 
z*O)Wj=--i^jGvggO^^^W@-k1|1UWmZ(tcu?MMizl zFNB-JyF*y>C$DxXkBa#8dynhE&%*zW_Z*}>+VQP@)J#Es?%Mmf<(^K?GjZRu@jNX9 zZW*$MpD8UHKz3?YUTAq5uqVa^%O=~ptx;qj4l6BPd7v$w;gNO}^{r`)$@pygh;T!- zXXTMj#@DpG)5)j_Xd8Q{b5Hf>;Hl06eey84>UcM=zci|V6Kx^Os-jx*<~w~<0ZZ0| zIb@RBy0_^gxY?je5gw+4AVqo(Xff=Y;itvfcJqi9v?n91Cxz8pc1Q$op(~FiD{8ry<&MBYOrd{*@ILq>8#^<*4X*!mE#@TD?+sYU7+~{#7UoU_C)AGLTn#ra& z?R`b6OK!R)n+RTG$VQJwv&bXp%DQx8I}r_ARuh|x`gO+Z$!mFA@cv4Ez>OmCJFhXe z&KdT~anE+%Jb2x@keB+P4?9wAza^3NxV$1QguQ{IzAq8op7)-EuT7kFs)(1ppr+xE zQ_VOeJuTMd*-uo4@*nzoRZgb@zh&ZzY3YK+DOu+-XF5lD_TByY@SU@YS&C zLx11)H1-siP(LY)hSy=F!M-w+q~3~TzpS?Gd|Bw+RY}lgbdW4+$b>KW zUSldPr!#%(lJwxeqI-r8S)({=X+QQgEYGfN4{E%Bo17Cd>0@?fb~0*PN3xl)cK%*c zBu`HE{BeCf5P!CcPmVpSjbVEZhQ0O+{k>#1 z*NxLR_U{oFeT;L8-np+5>Q>N-Fm%X#PJ9Bl@C%0Axw zl{xlevyOw0NKVV%T3`-f4@}FKu_NAgud`Y_i`Ca^sKR%ElaL8p7eBi`GTrs@>aKVs z6Skr&dsl5u7H$#jtTJKMaXPKYwhH}7#x(P6nfbJQ#~6~m;>y0?pYpoyEKm#!H~)jx z3-;68%mk;3_M1i`-g)Tw{UE#gCPQyIdG=xPw!@CRwJl!<^q)=ktPC&H!J0A@gbc|X zP3CA)?rr#dp9sI7bd}7(k*>vg${Z2PX`_HdYz-g933Y6C!HebKz=s73l-ks11ng`;oTqJXa_x# z-tP4Gv(tMObd5tD{0`z7_TcWQ7f^>x3OkLs<~ zzw59H9o`%cdm*QfSH*iyBhs00i1&D#@_&ed$oPUt?0{ysBcB!3!CRUYB9-H2I+#PIJW(Z#J2n#DAh7fF8w5q(I2nF-fIU;g2bl_sO^dgd-fuIyldDV#Tp0y?{}lEh h6p>W&ZKtZ TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName; +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#else //FFX_HLSL_6_2 + +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef min16##BaseComponentType TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) FFX_MIN16_SCALAR( TypeName, BaseComponentType ); +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ); +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ); + +#endif //FFX_HLSL_6_2 + +#else //FFX_HALF + +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName; +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#endif //FFX_HALF + +#if defined(FFX_GPU) +// Common typedefs: +#if defined(FFX_HLSL) +FFX_MIN16_SCALAR( FFX_MIN16_F , float ); +FFX_MIN16_VECTOR( FFX_MIN16_F2, float, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_F3, float, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_F4, float, 4 ); + +FFX_MIN16_SCALAR( FFX_MIN16_I, int ); +FFX_MIN16_VECTOR( FFX_MIN16_I2, int, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_I3, int, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_I4, int, 4 ); + +FFX_MIN16_SCALAR( FFX_MIN16_U, uint ); +FFX_MIN16_VECTOR( FFX_MIN16_U2, uint, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_U3, uint, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_U4, uint, 4 ); + +FFX_16BIT_SCALAR( FFX_F16_t , float ); +FFX_16BIT_VECTOR( FFX_F16_t2, float, 2 ); 
+FFX_16BIT_VECTOR( FFX_F16_t3, float, 3 ); +FFX_16BIT_VECTOR( FFX_F16_t4, float, 4 ); + +FFX_16BIT_SCALAR( FFX_I16_t, int ); +FFX_16BIT_VECTOR( FFX_I16_t2, int, 2 ); +FFX_16BIT_VECTOR( FFX_I16_t3, int, 3 ); +FFX_16BIT_VECTOR( FFX_I16_t4, int, 4 ); + +FFX_16BIT_SCALAR( FFX_U16_t, uint ); +FFX_16BIT_VECTOR( FFX_U16_t2, uint, 2 ); +FFX_16BIT_VECTOR( FFX_U16_t3, uint, 3 ); +FFX_16BIT_VECTOR( FFX_U16_t4, uint, 4 ); + +#define TYPEDEF_MIN16_TYPES(Prefix) \ +typedef FFX_MIN16_F Prefix##_F; \ +typedef FFX_MIN16_F2 Prefix##_F2; \ +typedef FFX_MIN16_F3 Prefix##_F3; \ +typedef FFX_MIN16_F4 Prefix##_F4; \ +typedef FFX_MIN16_I Prefix##_I; \ +typedef FFX_MIN16_I2 Prefix##_I2; \ +typedef FFX_MIN16_I3 Prefix##_I3; \ +typedef FFX_MIN16_I4 Prefix##_I4; \ +typedef FFX_MIN16_U Prefix##_U; \ +typedef FFX_MIN16_U2 Prefix##_U2; \ +typedef FFX_MIN16_U3 Prefix##_U3; \ +typedef FFX_MIN16_U4 Prefix##_U4; + +#define TYPEDEF_16BIT_TYPES(Prefix) \ +typedef FFX_16BIT_F Prefix##_F; \ +typedef FFX_16BIT_F2 Prefix##_F2; \ +typedef FFX_16BIT_F3 Prefix##_F3; \ +typedef FFX_16BIT_F4 Prefix##_F4; \ +typedef FFX_16BIT_I Prefix##_I; \ +typedef FFX_16BIT_I2 Prefix##_I2; \ +typedef FFX_16BIT_I3 Prefix##_I3; \ +typedef FFX_16BIT_I4 Prefix##_I4; \ +typedef FFX_16BIT_U Prefix##_U; \ +typedef FFX_16BIT_U2 Prefix##_U2; \ +typedef FFX_16BIT_U3 Prefix##_U3; \ +typedef FFX_16BIT_U4 Prefix##_U4; + +#define TYPEDEF_FULL_PRECISION_TYPES(Prefix) \ +typedef FfxFloat32 Prefix##_F; \ +typedef FfxFloat32x2 Prefix##_F2; \ +typedef FfxFloat32x3 Prefix##_F3; \ +typedef FfxFloat32x4 Prefix##_F4; \ +typedef FfxInt32 Prefix##_I; \ +typedef FfxInt32x2 Prefix##_I2; \ +typedef FfxInt32x3 Prefix##_I3; \ +typedef FfxInt32x4 Prefix##_I4; \ +typedef FfxUInt32 Prefix##_U; \ +typedef FfxUInt32x2 Prefix##_U2; \ +typedef FfxUInt32x3 Prefix##_U3; \ +typedef FfxUInt32x4 Prefix##_U4; +#endif // #if defined(FFX_HLSL) + +#if defined(FFX_GLSL) + +#if FFX_HALF + +#define FFX_MIN16_F float16_t +#define FFX_MIN16_F2 f16vec2 +#define FFX_MIN16_F3 f16vec3 +#define FFX_MIN16_F4 f16vec4 + +#define FFX_MIN16_I int16_t +#define FFX_MIN16_I2 i16vec2 +#define FFX_MIN16_I3 i16vec3 +#define FFX_MIN16_I4 i16vec4 + +#define FFX_MIN16_U uint16_t +#define FFX_MIN16_U2 u16vec2 +#define FFX_MIN16_U3 u16vec3 +#define FFX_MIN16_U4 u16vec4 + +#define FFX_16BIT_F float16_t +#define FFX_16BIT_F2 f16vec2 +#define FFX_16BIT_F3 f16vec3 +#define FFX_16BIT_F4 f16vec4 + +#define FFX_16BIT_I int16_t +#define FFX_16BIT_I2 i16vec2 +#define FFX_16BIT_I3 i16vec3 +#define FFX_16BIT_I4 i16vec4 + +#define FFX_16BIT_U uint16_t +#define FFX_16BIT_U2 u16vec2 +#define FFX_16BIT_U3 u16vec3 +#define FFX_16BIT_U4 u16vec4 + +#else // FFX_HALF + +#define FFX_MIN16_F float +#define FFX_MIN16_F2 vec2 +#define FFX_MIN16_F3 vec3 +#define FFX_MIN16_F4 vec4 + +#define FFX_MIN16_I int +#define FFX_MIN16_I2 ivec2 +#define FFX_MIN16_I3 ivec3 +#define FFX_MIN16_I4 ivec4 + +#define FFX_MIN16_U uint +#define FFX_MIN16_U2 uvec2 +#define FFX_MIN16_U3 uvec3 +#define FFX_MIN16_U4 uvec4 + +#define FFX_16BIT_F float +#define FFX_16BIT_F2 vec2 +#define FFX_16BIT_F3 vec3 +#define FFX_16BIT_F4 vec4 + +#define FFX_16BIT_I int +#define FFX_16BIT_I2 ivec2 +#define FFX_16BIT_I3 ivec3 +#define FFX_16BIT_I4 ivec4 + +#define FFX_16BIT_U uint +#define FFX_16BIT_U2 uvec2 +#define FFX_16BIT_U3 uvec3 +#define FFX_16BIT_U4 uvec4 + +#endif // FFX_HALF + +#endif // #if defined(FFX_GLSL) + +#endif // #if defined(FFX_GPU) +#endif // #ifndef FFX_COMMON_TYPES_H diff --git a/thirdparty/amd-fsr2/shaders/ffx_core.h b/thirdparty/amd-fsr2/shaders/ffx_core.h 
new file mode 100644 index 00000000000..4e687d6e3d6 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core.h @@ -0,0 +1,52 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// @defgroup Core +/// @defgroup HLSL +/// @defgroup GLSL +/// @defgroup GPU +/// @defgroup CPU +/// @defgroup CAS +/// @defgroup FSR1 + +#if !defined(FFX_CORE_H) +#define FFX_CORE_H + +#include "ffx_common_types.h" + +#if defined(FFX_CPU) + #include "ffx_core_cpu.h" +#endif // #if defined(FFX_CPU) + +#if defined(FFX_GLSL) && defined(FFX_GPU) + #include "ffx_core_glsl.h" +#endif // #if defined(FFX_GLSL) && defined(FFX_GPU) + +#if defined(FFX_HLSL) && defined(FFX_GPU) + #include "ffx_core_hlsl.h" +#endif // #if defined(FFX_HLSL) && defined(FFX_GPU) + +#if defined(FFX_GPU) + #include "ffx_core_gpu_common.h" + #include "ffx_core_gpu_common_half.h" + #include "ffx_core_portability.h" +#endif // #if defined(FFX_GPU) +#endif // #if !defined(FFX_CORE_H) \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_cpu.h b/thirdparty/amd-fsr2/shaders/ffx_core_cpu.h new file mode 100644 index 00000000000..3bf0295bfc6 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core_cpu.h @@ -0,0 +1,332 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// A define for a true value in a boolean expression. +/// +/// @ingroup CPU +#define FFX_TRUE (1) + +/// A define for a false value in a boolean expression. +/// +/// @ingroup CPU +#define FFX_FALSE (0) + +#if !defined(FFX_STATIC) +/// A define to abstract declaration of static variables and functions. +/// +/// @ingroup CPU +#define FFX_STATIC static +#endif // #if !defined(FFX_STATIC) + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-variable" +#endif + +/// Interpret the bit layout of an IEEE-754 floating point value as an unsigned integer. +/// +/// @param [in] x A 32bit floating value. +/// +/// @returns +/// An unsigned 32bit integer value containing the bit pattern of x. +/// +/// @ingroup CPU +FFX_STATIC FfxUInt32 ffxAsUInt32(FfxFloat32 x) +{ + union + { + FfxFloat32 f; + FfxUInt32 u; + } bits; + + bits.f = x; + return bits.u; +} + +FFX_STATIC FfxFloat32 ffxDot2(FfxFloat32x2 a, FfxFloat32x2 b) +{ + return a[0] * b[0] + a[1] * b[1]; +} + +FFX_STATIC FfxFloat32 ffxDot3(FfxFloat32x3 a, FfxFloat32x3 b) +{ + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + +FFX_STATIC FfxFloat32 ffxDot4(FfxFloat32x4 a, FfxFloat32x4 b) +{ + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the GLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t) +{ + return y * t + (-x * t + x); +} + +/// Compute the reciprocal of a value. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal value of x. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 ffxReciprocal(FfxFloat32 a) +{ + return 1.0f / a; +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 ffxSqrt(FfxFloat32 x) +{ + return sqrt(x); +} + +FFX_STATIC FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b) +{ + return FfxUInt32(FfxInt32(a) >> FfxInt32(b)); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 ffxFract(FfxFloat32 a) +{ + return a - floor(a); +} + +/// Compute the reciprocal square root of a value. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal square root value of x. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 rsqrt(FfxFloat32 a) +{ + return ffxReciprocal(ffxSqrt(a)); +} + +FFX_STATIC FfxFloat32 ffxMin(FfxFloat32 x, FfxFloat32 y) +{ + return x < y ? x : y; +} + +FFX_STATIC FfxUInt32 ffxMin(FfxUInt32 x, FfxUInt32 y) +{ + return x < y ? 
x : y; +} + +FFX_STATIC FfxFloat32 ffxMax(FfxFloat32 x, FfxFloat32 y) +{ + return x > y ? x : y; +} + +FFX_STATIC FfxUInt32 ffxMax(FfxUInt32 x, FfxUInt32 y) +{ + return x > y ? x : y; +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup CPU +FFX_STATIC FfxFloat32 ffxSaturate(FfxFloat32 a) +{ + return ffxMin(1.0f, ffxMax(0.0f, a)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +FFX_STATIC void opAAddOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b) +{ + d[0] = a[0] + b; + d[1] = a[1] + b; + d[2] = a[2] + b; + return; +} + +FFX_STATIC void opACpyF3(FfxFloat32x3 d, FfxFloat32x3 a) +{ + d[0] = a[0]; + d[1] = a[1]; + d[2] = a[2]; + return; +} + +FFX_STATIC void opAMulF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b) +{ + d[0] = a[0] * b[0]; + d[1] = a[1] * b[1]; + d[2] = a[2] * b[2]; + return; +} + +FFX_STATIC void opAMulOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b) +{ + d[0] = a[0] * b; + d[1] = a[1] * b; + d[2] = a[2] * b; + return; +} + +FFX_STATIC void opARcpF3(FfxFloat32x3 d, FfxFloat32x3 a) +{ + d[0] = ffxReciprocal(a[0]); + d[1] = ffxReciprocal(a[1]); + d[2] = ffxReciprocal(a[2]); + return; +} + +/// Convert FfxFloat32 to half (in lower 16-bits of output). +/// +/// This function implements the same fast technique that is documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf +/// +/// The function supports denormals. +/// +/// Some conversion rules are to make computations possibly "safer" on the GPU, +/// -INF & -NaN -> -65504 +/// +INF & +NaN -> +65504 +/// +/// @param [in] f The 32bit floating point value to convert. +/// +/// @returns +/// The closest 16bit floating point value to f. 
+/// +/// @ingroup CPU +FFX_STATIC FfxUInt32 f32tof16(FfxFloat32 f) +{ + static FfxUInt16 base[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, + 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, + 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, + 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, + 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00, + 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, + 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff + }; + + static FfxUInt8 shift[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18 + }; + + union + { + FfxFloat32 f; + FfxUInt32 u; + } bits; + + bits.f = f; + FfxUInt32 u = bits.u; + FfxUInt32 i = u >> 23; + return (FfxUInt32)(base[i]) + ((u & 0x7fffff) >> shift[i]); +} + +/// Pack 2x32-bit floating point values in a single 32bit value. +/// +/// This function first converts each component of value into their nearest 16-bit floating +/// point representation, and then stores the X and Y components in the lower and upper 16 bits of the +/// 32bit unsigned integer respectively. +/// +/// @param [in] value A 2-dimensional floating point value to convert and pack. +/// +/// @returns +/// A packed 32bit value containing 2 16bit floating point values. +/// +/// @ingroup CPU +FFX_STATIC FfxUInt32 packHalf2x16(FfxFloat32x2 a) +{ + return f32tof16(a[0]) + (f32tof16(a[1]) << 16); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_glsl.h b/thirdparty/amd-fsr2/shaders/ffx_core_glsl.h new file mode 100644 index 00000000000..6ec58f3c625 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core_glsl.h @@ -0,0 +1,1669 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// A define for abstracting shared memory between shading languages. +/// +/// @ingroup GPU +#define FFX_GROUPSHARED shared + +/// A define for abstracting compute memory barriers between shading languages. +/// +/// @ingroup GPU +#define FFX_GROUP_MEMORY_BARRIER() barrier() + +/// A define added to accept static markup on functions to aid CPU/GPU portability of code. +/// +/// @ingroup GPU +#define FFX_STATIC + +/// A define for abstracting loop unrolling between shading languages. +/// +/// @ingroup GPU +#define FFX_UNROLL + +/// A define for abstracting a 'greater than' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_GREATER_THAN(x, y) greaterThan(x, y) + +/// A define for abstracting a 'greater than or equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_GREATER_THAN_EQUAL(x, y) greaterThanEqual(x, y) + +/// A define for abstracting a 'less than' comparison operator between two types. 
+/// +/// @ingroup GPU +#define FFX_LESS_THAN(x, y) lessThan(x, y) + +/// A define for abstracting a 'less than or equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_LESS_THAN_EQUAL(x, y) lessThanEqual(x, y) + +/// A define for abstracting an 'equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_EQUAL(x, y) equal(x, y) + +/// A define for abstracting a 'not equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_NOT_EQUAL(x, y) notEqual(x, y) + +/// Broadcast a scalar value to a 1-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 2-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X2(x) FfxFloat32x2(FfxFloat32(x)) + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X3(x) FfxFloat32x3(FfxFloat32(x)) + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X4(x) FfxFloat32x4(FfxFloat32(x)) + +/// Broadcast a scalar value to a 1-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X2(x) FfxUInt32x2(FfxUInt32(x)) + +/// Broadcast a scalar value to a 3-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X3(x) FfxUInt32x3(FfxUInt32(x)) + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X4(x) FfxUInt32x4(FfxUInt32(x)) + +/// Broadcast a scalar value to a 1-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32(x) FfxInt32(x) + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X2(x) FfxInt32x2(FfxInt32(x)) + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X3(x) FfxInt32x3(FfxInt32(x)) + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X4(x) FfxInt32x4(FfxInt32(x)) + +/// Broadcast a scalar value to a 1-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16(x) FFX_MIN16_F(x) + +/// Broadcast a scalar value to a 2-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X2(x) FFX_MIN16_F2(FFX_MIN16_F(x)) + +/// Broadcast a scalar value to a 3-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X3(x) FFX_MIN16_F3(FFX_MIN16_F(x)) + +/// Broadcast a scalar value to a 4-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X4(x) FFX_MIN16_F4(FFX_MIN16_F(x)) + +/// Broadcast a scalar value to a 1-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16(x) FFX_MIN16_U(x) + +/// Broadcast a scalar value to a 2-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X2(x) FFX_MIN16_U2(FFX_MIN16_U(x)) + +/// Broadcast a scalar value to a 3-dimensional half-precision unsigned integer vector. 
+/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X3(x) FFX_MIN16_U3(FFX_MIN16_U(x)) + +/// Broadcast a scalar value to a 4-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X4(x) FFX_MIN16_U4(FFX_MIN16_U(x)) + +/// Broadcast a scalar value to a 1-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16(x) FFX_MIN16_I(x) + +/// Broadcast a scalar value to a 2-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X2(x) FFX_MIN16_I2(FFX_MIN16_I(x)) + +/// Broadcast a scalar value to a 3-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X3(x) FFX_MIN16_I3(FFX_MIN16_I(x)) + +/// Broadcast a scalar value to a 4-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X4(x) FFX_MIN16_I4(FFX_MIN16_I(x)) + +#if !defined(FFX_SKIP_EXT) +#if FFX_HALF + #extension GL_EXT_shader_16bit_storage : require + #extension GL_EXT_shader_explicit_arithmetic_types : require +#endif // FFX_HALF + +#if defined(FFX_LONG) + #extension GL_ARB_gpu_shader_int64 : require + #extension GL_NV_shader_atomic_int64 : require +#endif // #if defined(FFX_LONG) + +#if defined(FFX_WAVE) + #extension GL_KHR_shader_subgroup_arithmetic : require + #extension GL_KHR_shader_subgroup_ballot : require + #extension GL_KHR_shader_subgroup_quad : require + #extension GL_KHR_shader_subgroup_shuffle : require +#endif // #if defined(FFX_WAVE) +#endif // #if !defined(FFX_SKIP_EXT) + +// Forward declarations +FfxFloat32 ffxSqrt(FfxFloat32 x); +FfxFloat32x2 ffxSqrt(FfxFloat32x2 x); +FfxFloat32x3 ffxSqrt(FfxFloat32x3 x); +FfxFloat32x4 ffxSqrt(FfxFloat32x4 x); + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup GLSL +FfxFloat32 ffxAsFloat(FfxUInt32 x) +{ + return uintBitsToFloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxAsFloat(FfxUInt32x2 x) +{ + return uintBitsToFloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxAsFloat(FfxUInt32x3 x) +{ + return uintBitsToFloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxAsFloat(FfxUInt32x4 x) +{ + return uintBitsToFloat(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup GLSL +FfxUInt32 ffxAsUInt32(FfxFloat32 x) +{ + return floatBitsToUint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup GLSL +FfxUInt32x2 ffxAsUInt32(FfxFloat32x2 x) +{ + return floatBitsToUint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. 
+/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup GLSL +FfxUInt32x3 ffxAsUInt32(FfxFloat32x3 x) +{ + return floatBitsToUint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup GLSL +FfxUInt32x4 ffxAsUInt32(FfxFloat32x4 x) +{ + return floatBitsToUint(x); +} + +/// Convert a 32bit IEEE 754 floating point value to its nearest 16bit equivalent. +/// +/// @param [in] value The value to convert. +/// +/// @returns +/// The nearest 16bit equivalent of value. +/// +/// @ingroup GLSL +FfxUInt32 f32tof16(FfxFloat32 value) +{ + return packHalf2x16(FfxFloat32x2(value, 0.0)); +} + +/// Broadcast a scalar value to a 2-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional floating point vector with value in each component. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxBroadcast2(FfxFloat32 value) +{ + return FfxFloat32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional floating point vector with value in each component. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxBroadcast3(FfxFloat32 value) +{ + return FfxFloat32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 4-dimensional floating point vector with value in each component. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxBroadcast4(FfxFloat32 value) +{ + return FfxFloat32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional signed integer vector with value in each component. +/// +/// @ingroup GLSL +FfxInt32x2 ffxBroadcast2(FfxInt32 value) +{ + return FfxInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional signed integer vector with value in each component. +/// +/// @ingroup GLSL +FfxInt32x3 ffxBroadcast3(FfxInt32 value) +{ + return FfxInt32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 4-dimensional signed integer vector with value in each component. +/// +/// @ingroup GLSL +FfxInt32x4 ffxBroadcast4(FfxInt32 value) +{ + return FfxInt32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup GLSL +FfxUInt32x2 ffxBroadcast2(FfxUInt32 value) +{ + return FfxUInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional unsigned integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional unsigned integer vector with value in each component. 
+///
+/// @ingroup GLSL
+FfxUInt32x3 ffxBroadcast3(FfxUInt32 value)
+{
+    return FfxUInt32x3(value, value, value);
+}
+
+/// Broadcast a scalar value to a 4-dimensional unsigned integer vector.
+///
+/// @param [in] value The value to to broadcast.
+///
+/// @returns
+/// A 4-dimensional unsigned integer vector with value in each component.
+///
+/// @ingroup GLSL
+FfxUInt32x4 ffxBroadcast4(FfxUInt32 value)
+{
+    return FfxUInt32x4(value, value, value, value);
+}
+
+///
+///
+/// @ingroup GLSL
+FfxUInt32 bitfieldExtract(FfxUInt32 src, FfxUInt32 off, FfxUInt32 bits)
+{
+    return bitfieldExtract(src, FfxInt32(off), FfxInt32(bits));
+}
+
+///
+///
+/// @ingroup GLSL
+FfxUInt32 bitfieldInsert(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 mask)
+{
+    return (ins & mask) | (src & (~mask));
+}
+
+// Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+///
+///
+/// @ingroup GLSL
+FfxUInt32 bitfieldInsertMask(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 bits)
+{
+    return bitfieldInsert(src, ins, 0, FfxInt32(bits));
+}
+
+/// Compute the linear interopation between two values.
+///
+/// Implemented by calling the GLSL mix instrinsic function. Implements the
+/// following math:
+///
+///     (1 - t) * x + t * y
+///
+/// @param [in] x The first value to lerp between.
+/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y.
+///
+/// @returns
+/// A linearly interpolated value between x and y according to t.
+///
+/// @ingroup GLSL
+FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t)
+{
+    return mix(x, y, t);
+}
+
+/// Compute the linear interopation between two values.
+///
+/// Implemented by calling the GLSL mix instrinsic function. Implements the
+/// following math:
+///
+///     (1 - t) * x + t * y
+///
+/// @param [in] x The first value to lerp between.
+/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y.
+///
+/// @returns
+/// A linearly interpolated value between x and y according to t.
+///
+/// @ingroup GLSL
+FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32 t)
+{
+    return mix(x, y, t);
+}
+
+/// Compute the linear interopation between two values.
+///
+/// Implemented by calling the GLSL mix instrinsic function. Implements the
+/// following math:
+///
+///     (1 - t) * x + t * y
+///
+/// @param [in] x The first value to lerp between.
+/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y.
+///
+/// @returns
+/// A linearly interpolated value between x and y according to t.
+///
+/// @ingroup GLSL
+FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 t)
+{
+    return mix(x, y, t);
+}
+
+/// Compute the linear interopation between two values.
+///
+/// Implemented by calling the GLSL mix instrinsic function. Implements the
+/// following math:
+///
+///     (1 - t) * x + t * y
+///
+/// @param [in] x The first value to lerp between.
+/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y.
+///
+/// @returns
+/// A linearly interpolated value between x and y according to t.
+///
+/// @ingroup GLSL
+FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32 t)
+{
+    return mix(x, y, t);
+}
+
+/// Compute the linear interopation between two values.
+///
+/// Implemented by calling the GLSL mix instrinsic function. Implements the
+/// following math:
+///
+///     (1 - t) * x + t * y
+///
+/// @param [in] x The first value to lerp between.
+/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y.
+///
+/// @returns
+/// A linearly interpolated value between x and y according to t.
+/// +/// @ingroup GLSL +FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 t) +{ + return mix(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the GLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32 t) +{ + return mix(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the GLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 t) +{ + return mix(x, y, t); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32 ffxMax3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxMax3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxMax3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. 
+/// +/// @ingroup GLSL +FfxFloat32x4 ffxMax3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32 ffxMax3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN or RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x2 ffxMax3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x3 ffxMax3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x4 ffxMax3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32 ffxMed3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxMed3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. 
+/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxMed3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxMed3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxInt32 ffxMed3(FfxInt32 x, FfxInt32 y, FfxInt32 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxInt32x2 ffxMed3(FfxInt32x2 x, FfxInt32x2 y, FfxInt32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxInt32x3 ffxMed3(FfxInt32x3 x, FfxInt32x3 y, FfxInt32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup GLSL +FfxInt32x4 ffxMed3(FfxInt32x4 x, FfxInt32x4 y, FfxInt32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN and RDNA hardware. 
+/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32 ffxMin3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxMin3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxMin3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxMin3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32 ffxMin3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x2 ffxMin3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. 
+/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x3 ffxMin3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on +/// GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup GLSL +FfxUInt32x4 ffxMin3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return min(x, min(y, z)); +} + +/// Compute the reciprocal of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rcp can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal value of x. +/// +/// @ingroup GLSL +FfxFloat32 rcp(FfxFloat32 x) +{ + return FfxFloat32(1.0) / x; +} + +/// Compute the reciprocal of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rcp can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal value of x. +/// +/// @ingroup GLSL +FfxFloat32x2 rcp(FfxFloat32x2 x) +{ + return ffxBroadcast2(1.0) / x; +} + +/// Compute the reciprocal of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rcp can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal value of x. +/// +/// @ingroup GLSL +FfxFloat32x3 rcp(FfxFloat32x3 x) +{ + return ffxBroadcast3(1.0) / x; +} + +/// Compute the reciprocal of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rcp can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal value of x. +/// +/// @ingroup GLSL +FfxFloat32x4 rcp(FfxFloat32x4 x) +{ + return ffxBroadcast4(1.0) / x; +} + +/// Compute the reciprocal square root of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rsqrt can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal square root value of x. +/// +/// @ingroup GLSL +FfxFloat32 rsqrt(FfxFloat32 x) +{ + return FfxFloat32(1.0) / ffxSqrt(x); +} + +/// Compute the reciprocal square root of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rsqrt can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal square root value of x. +/// +/// @ingroup GLSL +FfxFloat32x2 rsqrt(FfxFloat32x2 x) +{ + return ffxBroadcast2(1.0) / ffxSqrt(x); +} + +/// Compute the reciprocal square root of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rsqrt can be used. +/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal square root value of x. +/// +/// @ingroup GLSL +FfxFloat32x3 rsqrt(FfxFloat32x3 x) +{ + return ffxBroadcast3(1.0) / ffxSqrt(x); +} + +/// Compute the reciprocal square root of a value. +/// +/// NOTE: This function is only provided for GLSL. In HLSL the intrinsic function rsqrt can be used. 
+/// +/// @param [in] x The value to compute the reciprocal for. +/// +/// @returns +/// The reciprocal square root value of x. +/// +/// @ingroup GLSL +FfxFloat32x4 rsqrt(FfxFloat32x4 x) +{ + return ffxBroadcast4(1.0) / ffxSqrt(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup GLSL +FfxFloat32 ffxSaturate(FfxFloat32 x) +{ + return clamp(x, FfxFloat32(0.0), FfxFloat32(1.0)); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup GLSL +FfxFloat32x2 ffxSaturate(FfxFloat32x2 x) +{ + return clamp(x, ffxBroadcast2(0.0), ffxBroadcast2(1.0)); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup GLSL +FfxFloat32x3 ffxSaturate(FfxFloat32x3 x) +{ + return clamp(x, ffxBroadcast3(0.0), ffxBroadcast3(1.0)); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup GLSL +FfxFloat32x4 ffxSaturate(FfxFloat32x4 x) +{ + return clamp(x, ffxBroadcast4(0.0), ffxBroadcast4(1.0)); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32 ffxFract(FfxFloat32 x) +{ + return fract(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxFract(FfxFloat32x2 x) +{ + return fract(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxFract(FfxFloat32x3 x) +{ + return fract(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. 
+/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxFract(FfxFloat32x4 x) +{ + return fract(x); +} + +FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b) +{ + return FfxUInt32(FfxInt32(a) >> FfxInt32(b)); +} + +#if FFX_HALF + +#define FFX_UINT32_TO_FLOAT16X2(x) unpackFloat2x16(FfxUInt32(x)) + +FfxFloat16x4 ffxUint32x2ToFloat16x4(FfxUInt32x2 x) +{ + return FfxFloat16x4(unpackFloat2x16(x.x), unpackFloat2x16(x.y)); +} +#define FFX_UINT32X2_TO_FLOAT16X4(x) ffxUint32x2ToFloat16x4(FfxUInt32x2(x)) +#define FFX_UINT32_TO_UINT16X2(x) unpackUint2x16(FfxUInt32(x)) +#define FFX_UINT32X2_TO_UINT16X4(x) unpackUint4x16(pack64(FfxUInt32x2(x))) +//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_FLOAT16X2_TO_UINT32(x) packFloat2x16(FfxFloat16x2(x)) +FfxUInt32x2 ffxFloat16x4ToUint32x2(FfxFloat16x4 x) +{ + return FfxUInt32x2(packFloat2x16(x.xy), packFloat2x16(x.zw)); +} +#define FFX_FLOAT16X4_TO_UINT32X2(x) ffxFloat16x4ToUint32x2(FfxFloat16x4(x)) +#define FFX_UINT16X2_TO_UINT32(x) packUint2x16(FfxUInt16x2(x)) +#define FFX_UINT16X4_TO_UINT32X2(x) unpack32(packUint4x16(FfxUInt16x4(x))) +//============================================================================================================================== +#define FFX_TO_UINT16(x) halfBitsToUint16(FfxFloat16(x)) +#define FFX_TO_UINT16X2(x) halfBitsToUint16(FfxFloat16x2(x)) +#define FFX_TO_UINT16X3(x) halfBitsToUint16(FfxFloat16x3(x)) +#define FFX_TO_UINT16X4(x) halfBitsToUint16(FfxFloat16x4(x)) +//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_TO_FLOAT16(x) uint16BitsToHalf(FfxUInt16(x)) +#define FFX_TO_FLOAT16X2(x) uint16BitsToHalf(FfxUInt16x2(x)) +#define FFX_TO_FLOAT16X3(x) uint16BitsToHalf(FfxUInt16x3(x)) +#define FFX_TO_FLOAT16X4(x) uint16BitsToHalf(FfxUInt16x4(x)) +//============================================================================================================================== +FfxFloat16 ffxBroadcastFloat16(FfxFloat16 a) +{ + return FfxFloat16(a); +} +FfxFloat16x2 ffxBroadcastFloat16x2(FfxFloat16 a) +{ + return FfxFloat16x2(a, a); +} +FfxFloat16x3 ffxBroadcastFloat16x3(FfxFloat16 a) +{ + return FfxFloat16x3(a, a, a); +} +FfxFloat16x4 ffxBroadcastFloat16x4(FfxFloat16 a) +{ + return FfxFloat16x4(a, a, a, a); +} +#define FFX_BROADCAST_FLOAT16(a) FfxFloat16(a) +#define FFX_BROADCAST_FLOAT16X2(a) FfxFloat16x2(FfxFloat16(a)) +#define FFX_BROADCAST_FLOAT16X3(a) FfxFloat16x3(FfxFloat16(a)) +#define FFX_BROADCAST_FLOAT16X4(a) FfxFloat16x4(FfxFloat16(a)) +//------------------------------------------------------------------------------------------------------------------------------ +FfxInt16 ffxBroadcastInt16(FfxInt16 a) +{ + return FfxInt16(a); +} +FfxInt16x2 ffxBroadcastInt16x2(FfxInt16 a) +{ + return FfxInt16x2(a, a); +} +FfxInt16x3 ffxBroadcastInt16x3(FfxInt16 a) +{ + return FfxInt16x3(a, a, a); +} +FfxInt16x4 ffxBroadcastInt16x4(FfxInt16 a) +{ + return FfxInt16x4(a, a, a, a); +} +#define FFX_BROADCAST_INT16(a) FfxInt16(a) +#define FFX_BROADCAST_INT16X2(a) FfxInt16x2(FfxInt16(a)) +#define FFX_BROADCAST_INT16X3(a) FfxInt16x3(FfxInt16(a)) +#define FFX_BROADCAST_INT16X4(a) FfxInt16x4(FfxInt16(a)) +//------------------------------------------------------------------------------------------------------------------------------ +FfxUInt16 ffxBroadcastUInt16(FfxUInt16 a) +{ + return FfxUInt16(a); +} +FfxUInt16x2 
ffxBroadcastUInt16x2(FfxUInt16 a) +{ + return FfxUInt16x2(a, a); +} +FfxUInt16x3 ffxBroadcastUInt16x3(FfxUInt16 a) +{ + return FfxUInt16x3(a, a, a); +} +FfxUInt16x4 ffxBroadcastUInt16x4(FfxUInt16 a) +{ + return FfxUInt16x4(a, a, a, a); +} +#define FFX_BROADCAST_UINT16(a) FfxUInt16(a) +#define FFX_BROADCAST_UINT16X2(a) FfxUInt16x2(FfxUInt16(a)) +#define FFX_BROADCAST_UINT16X3(a) FfxUInt16x3(FfxUInt16(a)) +#define FFX_BROADCAST_UINT16X4(a) FfxUInt16x4(FfxUInt16(a)) +//============================================================================================================================== +FfxUInt16 ffxAbsHalf(FfxUInt16 a) +{ + return FfxUInt16(abs(FfxInt16(a))); +} +FfxUInt16x2 ffxAbsHalf(FfxUInt16x2 a) +{ + return FfxUInt16x2(abs(FfxInt16x2(a))); +} +FfxUInt16x3 ffxAbsHalf(FfxUInt16x3 a) +{ + return FfxUInt16x3(abs(FfxInt16x3(a))); +} +FfxUInt16x4 ffxAbsHalf(FfxUInt16x4 a) +{ + return FfxUInt16x4(abs(FfxInt16x4(a))); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxClampHalf(FfxFloat16 x, FfxFloat16 n, FfxFloat16 m) +{ + return clamp(x, n, m); +} +FfxFloat16x2 ffxClampHalf(FfxFloat16x2 x, FfxFloat16x2 n, FfxFloat16x2 m) +{ + return clamp(x, n, m); +} +FfxFloat16x3 ffxClampHalf(FfxFloat16x3 x, FfxFloat16x3 n, FfxFloat16x3 m) +{ + return clamp(x, n, m); +} +FfxFloat16x4 ffxClampHalf(FfxFloat16x4 x, FfxFloat16x4 n, FfxFloat16x4 m) +{ + return clamp(x, n, m); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxFract(FfxFloat16 x) +{ + return fract(x); +} +FfxFloat16x2 ffxFract(FfxFloat16x2 x) +{ + return fract(x); +} +FfxFloat16x3 ffxFract(FfxFloat16x3 x) +{ + return fract(x); +} +FfxFloat16x4 ffxFract(FfxFloat16x4 x) +{ + return fract(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxLerp(FfxFloat16 x, FfxFloat16 y, FfxFloat16 a) +{ + return mix(x, y, a); +} +FfxFloat16x2 ffxLerp(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16 a) +{ + return mix(x, y, a); +} +FfxFloat16x2 ffxLerp(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 a) +{ + return mix(x, y, a); +} +FfxFloat16x3 ffxLerp(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 a) +{ + return mix(x, y, a); +} +FfxFloat16x3 ffxLerp(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16 a) +{ + return mix(x, y, a); +} +FfxFloat16x4 ffxLerp(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16 a) +{ + return mix(x, y, a); +} +FfxFloat16x4 ffxLerp(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 a) +{ + return mix(x, y, a); +} +//------------------------------------------------------------------------------------------------------------------------------ +// No packed version of ffxMid3. 
+FfxFloat16 ffxMed3Half(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxFloat16x2 ffxMed3Half(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxFloat16x3 ffxMed3Half(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxFloat16x4 ffxMed3Half(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxInt16 ffxMed3Half(FfxInt16 x, FfxInt16 y, FfxInt16 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxInt16x2 ffxMed3Half(FfxInt16x2 x, FfxInt16x2 y, FfxInt16x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxInt16x3 ffxMed3Half(FfxInt16x3 x, FfxInt16x3 y, FfxInt16x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FfxInt16x4 ffxMed3Half(FfxInt16x4 x, FfxInt16x4 y, FfxInt16x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +// No packed version of ffxMax3. +FfxFloat16 ffxMax3Half(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + return max(x, max(y, z)); +} +FfxFloat16x2 ffxMax3Half(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + return max(x, max(y, z)); +} +FfxFloat16x3 ffxMax3Half(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + return max(x, max(y, z)); +} +FfxFloat16x4 ffxMax3Half(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + return max(x, max(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +// No packed version of ffxMin3. +FfxFloat16 ffxMin3Half(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + return min(x, min(y, z)); +} +FfxFloat16x2 ffxMin3Half(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + return min(x, min(y, z)); +} +FfxFloat16x3 ffxMin3Half(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + return min(x, min(y, z)); +} +FfxFloat16x4 ffxMin3Half(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + return min(x, min(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxReciprocalHalf(FfxFloat16 x) +{ + return FFX_BROADCAST_FLOAT16(1.0) / x; +} +FfxFloat16x2 ffxReciprocalHalf(FfxFloat16x2 x) +{ + return FFX_BROADCAST_FLOAT16X2(1.0) / x; +} +FfxFloat16x3 ffxReciprocalHalf(FfxFloat16x3 x) +{ + return FFX_BROADCAST_FLOAT16X3(1.0) / x; +} +FfxFloat16x4 ffxReciprocalHalf(FfxFloat16x4 x) +{ + return FFX_BROADCAST_FLOAT16X4(1.0) / x; +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxReciprocalSquareRootHalf(FfxFloat16 x) +{ + return FFX_BROADCAST_FLOAT16(1.0) / sqrt(x); +} +FfxFloat16x2 ffxReciprocalSquareRootHalf(FfxFloat16x2 x) +{ + return FFX_BROADCAST_FLOAT16X2(1.0) / sqrt(x); +} +FfxFloat16x3 ffxReciprocalSquareRootHalf(FfxFloat16x3 x) +{ + return FFX_BROADCAST_FLOAT16X3(1.0) / sqrt(x); +} +FfxFloat16x4 ffxReciprocalSquareRootHalf(FfxFloat16x4 x) +{ + return FFX_BROADCAST_FLOAT16X4(1.0) / sqrt(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxFloat16 ffxSaturate(FfxFloat16 x) +{ + return clamp(x, FFX_BROADCAST_FLOAT16(0.0), FFX_BROADCAST_FLOAT16(1.0)); +} +FfxFloat16x2 ffxSaturate(FfxFloat16x2 x) +{ + return clamp(x, FFX_BROADCAST_FLOAT16X2(0.0), 
FFX_BROADCAST_FLOAT16X2(1.0)); +} +FfxFloat16x3 ffxSaturate(FfxFloat16x3 x) +{ + return clamp(x, FFX_BROADCAST_FLOAT16X3(0.0), FFX_BROADCAST_FLOAT16X3(1.0)); +} +FfxFloat16x4 ffxSaturate(FfxFloat16x4 x) +{ + return clamp(x, FFX_BROADCAST_FLOAT16X4(0.0), FFX_BROADCAST_FLOAT16X4(1.0)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FfxUInt16 ffxBitShiftRightHalf(FfxUInt16 a, FfxUInt16 b) +{ + return FfxUInt16(FfxInt16(a) >> FfxInt16(b)); +} +FfxUInt16x2 ffxBitShiftRightHalf(FfxUInt16x2 a, FfxUInt16x2 b) +{ + return FfxUInt16x2(FfxInt16x2(a) >> FfxInt16x2(b)); +} +FfxUInt16x3 ffxBitShiftRightHalf(FfxUInt16x3 a, FfxUInt16x3 b) +{ + return FfxUInt16x3(FfxInt16x3(a) >> FfxInt16x3(b)); +} +FfxUInt16x4 ffxBitShiftRightHalf(FfxUInt16x4 a, FfxUInt16x4 b) +{ + return FfxUInt16x4(FfxInt16x4(a) >> FfxInt16x4(b)); +} +#endif // FFX_HALF + +#if defined(FFX_WAVE) +// Where 'x' must be a compile time literal. +FfxFloat32 AWaveXorF1(FfxFloat32 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxFloat32x2 AWaveXorF2(FfxFloat32x2 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxFloat32x3 AWaveXorF3(FfxFloat32x3 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxFloat32x4 AWaveXorF4(FfxFloat32x4 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxUInt32 AWaveXorU1(FfxUInt32 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxUInt32x2 AWaveXorU2(FfxUInt32x2 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxUInt32x3 AWaveXorU3(FfxUInt32x3 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} +FfxUInt32x4 AWaveXorU4(FfxUInt32x4 v, FfxUInt32 x) +{ + return subgroupShuffleXor(v, x); +} + +//------------------------------------------------------------------------------------------------------------------------------ +#if FFX_HALF +FfxFloat16x2 ffxWaveXorFloat16x2(FfxFloat16x2 v, FfxUInt32 x) +{ + return FFX_UINT32_TO_FLOAT16X2(subgroupShuffleXor(FFX_FLOAT16X2_TO_UINT32(v), x)); +} +FfxFloat16x4 ffxWaveXorFloat16x4(FfxFloat16x4 v, FfxUInt32 x) +{ + return FFX_UINT32X2_TO_FLOAT16X4(subgroupShuffleXor(FFX_FLOAT16X4_TO_UINT32X2(v), x)); +} +FfxUInt16x2 ffxWaveXorUint16x2(FfxUInt16x2 v, FfxUInt32 x) +{ + return FFX_UINT32_TO_UINT16X2(subgroupShuffleXor(FFX_UINT16X2_TO_UINT32(v), x)); +} +FfxUInt16x4 ffxWaveXorUint16x4(FfxUInt16x4 v, FfxUInt32 x) +{ + return FFX_UINT32X2_TO_UINT16X4(subgroupShuffleXor(FFX_UINT16X4_TO_UINT32X2(v), x)); +} +#endif // FFX_HALF +#endif // #if defined(FFX_WAVE) diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common.h b/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common.h new file mode 100644 index 00000000000..ae07642f0df --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common.h @@ -0,0 +1,2784 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// A define for a true value in a boolean expression. +/// +/// @ingroup GPU +#define FFX_TRUE (true) + +/// A define for a false value in a boolean expression. +/// +/// @ingroup GPU +#define FFX_FALSE (false) + +/// A define value for positive infinity. +/// +/// @ingroup GPU +#define FFX_POSITIVE_INFINITY_FLOAT ffxAsFloat(0x7f800000u) + +/// A define value for negative infinity. +/// +/// @ingroup GPU +#define FFX_NEGATIVE_INFINITY_FLOAT ffxAsFloat(0xff800000u) + +/// A define value for PI. +/// +/// @ingroup GPU +#define FFX_PI (3.14159) + + +/// Compute the reciprocal of value. +/// +/// @param [in] value The value to compute the reciprocal of. +/// +/// @returns +/// The 1 / value. +/// +/// @ingroup GPU +FfxFloat32 ffxReciprocal(FfxFloat32 value) +{ + return rcp(value); +} + +/// Compute the reciprocal of value. +/// +/// @param [in] value The value to compute the reciprocal of. +/// +/// @returns +/// The 1 / value. +/// +/// @ingroup GPU +FfxFloat32x2 ffxReciprocal(FfxFloat32x2 value) +{ + return rcp(value); +} + +/// Compute the reciprocal of value. +/// +/// @param [in] value The value to compute the reciprocal of. +/// +/// @returns +/// The 1 / value. +/// +/// @ingroup GPU +FfxFloat32x3 ffxReciprocal(FfxFloat32x3 value) +{ + return rcp(value); +} + +/// Compute the reciprocal of value. +/// +/// @param [in] value The value to compute the reciprocal of. +/// +/// @returns +/// The 1 / value. +/// +/// @ingroup GPU +FfxFloat32x4 ffxReciprocal(FfxFloat32x4 value) +{ + return rcp(value); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32 ffxMin(FfxFloat32 x, FfxFloat32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32x2 ffxMin(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32x3 ffxMin(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32x4 ffxMin(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. 
+/// +/// @ingroup GPU +FfxInt32 ffxMin(FfxInt32 x, FfxInt32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x2 ffxMin(FfxInt32x2 x, FfxInt32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x3 ffxMin(FfxInt32x3 x, FfxInt32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x4 ffxMin(FfxInt32x4 x, FfxInt32x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32 ffxMin(FfxUInt32 x, FfxUInt32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x2 ffxMin(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x3 ffxMin(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x4 ffxMin(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return min(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32 ffxMax(FfxFloat32 x, FfxFloat32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32x2 ffxMax(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat32x3 ffxMax(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. 
+/// +/// @ingroup GPU +FfxFloat32x4 ffxMax(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32 ffxMax(FfxInt32 x, FfxInt32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x2 ffxMax(FfxInt32x2 x, FfxInt32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x3 ffxMax(FfxInt32x3 x, FfxInt32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt32x4 ffxMax(FfxInt32x4 x, FfxInt32x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32 ffxMax(FfxUInt32 x, FfxUInt32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x2 ffxMax(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x3 ffxMax(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt32x4 ffxMax(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return max(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat32 ffxPow(FfxFloat32 x, FfxFloat32 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat32x2 ffxPow(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. 
+/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat32x3 ffxPow(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat32x4 ffxPow(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return pow(x, y); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat32 ffxSqrt(FfxFloat32 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat32x2 ffxSqrt(FfxFloat32x2 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat32x3 ffxSqrt(FfxFloat32x3 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat32x4 ffxSqrt(FfxFloat32x4 x) +{ + return sqrt(x); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat32 ffxCopySignBit(FfxFloat32 d, FfxFloat32 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & FfxUInt32(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat32x2 ffxCopySignBit(FfxFloat32x2 d, FfxFloat32x2 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast2(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat32x3 ffxCopySignBit(FfxFloat32x3 d, FfxFloat32x3 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast3(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat32x4 ffxCopySignBit(FfxFloat32x4 d, FfxFloat32x4 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast4(0x80000000u))); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. 
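
The bit-level form of copysign used by ffxCopySignBit above can be checked on the CPU with a minimal C sketch (illustration only, not part of the vendored header; f2u/u2f are stand-ins for ffxAsUInt32/ffxAsFloat and copy_sign_bit is a hypothetical name):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float    u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* OR the sign bit of s into a non-negative d, as ffxCopySignBit does.
 * Matches copysignf(d, s) whenever d >= 0. */
static float copy_sign_bit(float d, float s)
{
    return u2f(f2u(d) | (f2u(s) & 0x80000000u));
}

int main(void)
{
    printf("%f\n", copy_sign_bit(3.0f, -1.0f)); /* -3.000000 */
    printf("%f\n", copy_sign_bit(3.0f,  1.0f)); /*  3.000000 */
    return 0;
}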
+/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat32 ffxIsSigned(FfxFloat32 m) +{ + return ffxSaturate(m * FfxFloat32(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat32x2 ffxIsSigned(FfxFloat32x2 m) +{ + return ffxSaturate(m * ffxBroadcast2(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat32x3 ffxIsSigned(FfxFloat32x3 m) +{ + return ffxSaturate(m * ffxBroadcast3(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against for have the sign set. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or positive. +/// +/// @ingroup GPU +FfxFloat32x4 ffxIsSigned(FfxFloat32x4 m) +{ + return ffxSaturate(m * ffxBroadcast4(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat32 ffxIsGreaterThanZero(FfxFloat32 m) +{ + return ffxSaturate(m * FfxFloat32(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat32x2 ffxIsGreaterThanZero(FfxFloat32x2 m) +{ + return ffxSaturate(m * ffxBroadcast2(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. 
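
The ffxIsSigned/ffxIsGreaterThanZero masks above rely on saturate folding the NaN produced by 0 * INF down to zero. A small CPU-side C sketch of the same trick, assuming fminf/fmaxf for saturate (both return the non-NaN operand, mirroring the GPU behaviour); function names here are illustrative only:

#include <math.h>
#include <stdio.h>

/* saturate() with GPU-like NaN handling: fmaxf/fminf return the non-NaN
 * operand, so saturate(NaN) evaluates to 0. */
static float saturate(float x) { return fminf(fmaxf(x, 0.0f), 1.0f); }

/* 1.0 if m < 0, else 0.0 (the ffxIsSigned trick). */
static float is_signed(float m) { return saturate(m * -INFINITY); }

/* 1.0 if m > 0, else 0.0 (the ffxIsGreaterThanZero trick). */
static float is_greater_than_zero(float m) { return saturate(m * INFINITY); }

int main(void)
{
    printf("%g %g %g\n", is_signed(-2.0f), is_signed(0.0f), is_signed(5.0f)); /* 1 0 0 */
    printf("%g %g %g\n", is_greater_than_zero(-2.0f),
                         is_greater_than_zero(0.0f),
                         is_greater_than_zero(5.0f));                         /* 0 0 1 */
    return 0;
}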
+/// +/// @ingroup GPU +FfxFloat32x3 ffxIsGreaterThanZero(FfxFloat32x3 m) +{ + return ffxSaturate(m * ffxBroadcast3(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat32x4 ffxIsGreaterThanZero(FfxFloat32x4 m) +{ + return ffxSaturate(m * ffxBroadcast4(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Convert a 32bit floating point value to sortable integer. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the side effects that: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +/// +/// @param [in] value The floating point value to make sortable. +/// +/// @returns +/// The sortable integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxFloatToSortableInteger(FfxUInt32 value) +{ + return value ^ ((AShrSU1(value, FfxUInt32(31))) | FfxUInt32(0x80000000)); +} + +/// Convert a sortable integer to a 32bit floating point value. +/// +/// The function has the side effects that: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] value The floating point value to make sortable. +/// +/// @returns +/// The sortable integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxSortableIntegerToFloat(FfxUInt32 value) +{ + return value ^ ((~AShrSU1(value, FfxUInt32(31))) | FfxUInt32(0x80000000)); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateSqrt(FfxFloat32 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(1)) + FfxUInt32(0x1fbc4639)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateReciprocal(FfxFloat32 a) +{ + return ffxAsFloat(FfxUInt32(0x7ef07ebb) - ffxAsUInt32(a)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. 
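
The sortable-integer mapping defined just above (ffxFloatToSortableInteger / ffxSortableIntegerToFloat) flips either the sign bit or every bit so that unsigned comparisons order the keys the same way as the original floats. A CPU-side C sketch of the same mapping, with hypothetical helper names:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float    u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* Positives: flip the sign bit only. Negatives: flip every bit. */
static uint32_t float_to_sortable(uint32_t bits)
{
    return bits ^ ((bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u);
}

/* Inverse of the mapping above. */
static uint32_t sortable_to_float(uint32_t key)
{
    return key ^ ((key & 0x80000000u) ? 0x80000000u : 0xFFFFFFFFu);
}

int main(void)
{
    const float v[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
    for (int i = 0; i < 4; ++i) {
        uint32_t key = float_to_sortable(f2u(v[i]));
        /* Keys increase with the value, and the mapping round-trips exactly. */
        printf("%5.2f -> 0x%08x -> %5.2f\n", v[i], (unsigned)key, u2f(sortable_to_float(key)));
    }
    return 0;
}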
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateReciprocalMedium(FfxFloat32 value) +{ + FfxFloat32 b = ffxAsFloat(FfxUInt32(0x7ef19fff) - ffxAsUInt32(value)); + return b * (-b * value + FfxFloat32(2.0)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal square root for. +/// +/// @returns +/// An approximation of the reciprocal square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateReciprocalSquareRoot(FfxFloat32 a) +{ + return ffxAsFloat(FfxUInt32(0x5f347d74) - (ffxAsUInt32(a) >> FfxUInt32(1))); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateSqrt(FfxFloat32x2 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(1u)) + ffxBroadcast2(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateReciprocal(FfxFloat32x2 a) +{ + return ffxAsFloat(ffxBroadcast2(0x7ef07ebbu) - ffxAsUInt32(a)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. 
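
The low- and medium-quality reciprocal helpers above are a bit-trick seed (a magic constant minus the float's bit pattern), optionally refined by one Newton-Raphson step. A CPU-side C sketch using the same magic constants as the header; names are illustrative:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float    u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* Low quality 1/a: bit-trick seed only (ffxApproximateReciprocal). */
static float approx_rcp(float a) { return u2f(0x7ef07ebbu - f2u(a)); }

/* Medium quality 1/a: seed plus one Newton-Raphson step b' = b * (2 - a*b)
 * (ffxApproximateReciprocalMedium). */
static float approx_rcp_medium(float a)
{
    float b = u2f(0x7ef19fffu - f2u(a));
    return b * (-b * a + 2.0f);
}

int main(void)
{
    for (float a = 0.5f; a <= 8.0f; a *= 2.0f)
        printf("a=%g exact=%f low=%f medium=%f\n",
               a, 1.0 / a, approx_rcp(a), approx_rcp_medium(a));
    return 0;
}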
+/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateReciprocalMedium(FfxFloat32x2 a) +{ + FfxFloat32x2 b = ffxAsFloat(ffxBroadcast2(0x7ef19fffu) - ffxAsUInt32(a)); + return b * (-b * a + ffxBroadcast2(2.0f)); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateReciprocalSquareRoot(FfxFloat32x2 a) +{ + return ffxAsFloat(ffxBroadcast2(0x5f347d74u) - (ffxAsUInt32(a) >> ffxBroadcast2(1u))); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateSqrt(FfxFloat32x3 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(1u)) + ffxBroadcast3(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateReciprocal(FfxFloat32x3 a) +{ + return ffxAsFloat(ffxBroadcast3(0x7ef07ebbu) - ffxAsUInt32(a)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateReciprocalMedium(FfxFloat32x3 a) +{ + FfxFloat32x3 b = ffxAsFloat(ffxBroadcast3(0x7ef19fffu) - ffxAsUInt32(a)); + return b * (-b * a + ffxBroadcast3(2.0f)); +} + +/// Calculate a low-quality approximation for the square root of a value. 
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateReciprocalSquareRoot(FfxFloat32x3 a) +{ + return ffxAsFloat(ffxBroadcast3(0x5f347d74u) - (ffxAsUInt32(a) >> ffxBroadcast3(1u))); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateSqrt(FfxFloat32x4 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(1u)) + ffxBroadcast4(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateReciprocal(FfxFloat32x4 a) +{ + return ffxAsFloat(ffxBroadcast4(0x7ef07ebbu) - ffxAsUInt32(a)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateReciprocalMedium(FfxFloat32x4 a) +{ + FfxFloat32x4 b = ffxAsFloat(ffxBroadcast4(0x7ef19fffu) - ffxAsUInt32(a)); + return b * (-b * a + ffxBroadcast4(2.0f)); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to calculate an approximate to the square root for. 
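
The square-root and reciprocal-square-root approximations use the same exponent-halving idea: shift the bit pattern right by one and re-bias with a magic constant. A CPU-side C sketch comparing both against libm (the vector overloads in the header apply the same formula per component); names are illustrative:

#include <stdint.h>
#include <string.h>
#include <math.h>
#include <stdio.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float    u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* Low quality sqrt(a) (ffxApproximateSqrt). */
static float approx_sqrt(float a)  { return u2f((f2u(a) >> 1) + 0x1fbc4639u); }

/* Low quality 1/sqrt(a): the classic "fast inverse square root" seed without
 * the Newton-Raphson refinement (ffxApproximateReciprocalSquareRoot). */
static float approx_rsqrt(float a) { return u2f(0x5f347d74u - (f2u(a) >> 1)); }

int main(void)
{
    float a = 2.0f;
    printf("sqrt : exact=%f approx=%f\n", sqrtf(a), approx_sqrt(a));
    printf("rsqrt: exact=%f approx=%f\n", 1.0f / sqrtf(a), approx_rsqrt(a));
    return 0;
}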
+/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateReciprocalSquareRoot(FfxFloat32x4 a) +{ + return ffxAsFloat(ffxBroadcast4(0x5f347d74u) - (ffxAsUInt32(a) >> ffxBroadcast4(1u))); +} + +/// Calculate dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPU +FfxFloat32 ffxDot2(FfxFloat32x2 a, FfxFloat32x2 b) +{ + return dot(a, b); +} + +/// Calculate dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPU +FfxFloat32 ffxDot3(FfxFloat32x3 a, FfxFloat32x3 b) +{ + return dot(a, b); +} + +/// Calculate dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPU +FfxFloat32 ffxDot4(FfxFloat32x4 a, FfxFloat32x4 b) +{ + return dot(a, b); +} + + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and Gamma2. +/// +/// @returns +/// The value a converted into Gamma2. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximatePQToGamma2Medium(FfxFloat32 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and linear. +/// +/// @returns +/// The value a converted into linear. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximatePQToLinear(FfxFloat32 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateGamma2ToPQ(FfxFloat32 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(2)) + FfxUInt32(0x2F9A4E46)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. 
+/// +/// @ingroup GPU +FfxFloat32 ffxApproximateGamma2ToPQMedium(FfxFloat32 a) +{ + FfxFloat32 b = ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(2)) + FfxUInt32(0x2F9A4E46)); + FfxFloat32 b4 = b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateGamma2ToPQHigh(FfxFloat32 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateLinearToPQ(FfxFloat32 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(3)) + FfxUInt32(0x378D8723)); +} + +/// Compute a more accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateLinearToPQMedium(FfxFloat32 a) +{ + FfxFloat32 b = ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(3)) + FfxUInt32(0x378D8723)); + FfxFloat32 b8 = b * b * b * b * b * b * b * b; + return b - b * (b8 - a) / (FfxFloat32(8.0) * b8); +} + +/// Compute a very accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32 ffxApproximateLinearToPQHigh(FfxFloat32 a) +{ + return ffxSqrt(ffxSqrt(ffxSqrt(a))); +} + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and Gamma2. +/// +/// @returns +/// The value a converted into Gamma2. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximatePQToGamma2Medium(FfxFloat32x2 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. +/// +/// PQ is very close to x^(1/8). 
The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and linear. +/// +/// @returns +/// The value a converted into linear. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximatePQToLinear(FfxFloat32x2 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateGamma2ToPQ(FfxFloat32x2 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(2u)) + ffxBroadcast2(0x2F9A4E46u)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateGamma2ToPQMedium(FfxFloat32x2 a) +{ + FfxFloat32x2 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(2u)) + ffxBroadcast2(0x2F9A4E46u)); + FfxFloat32x2 b4 = b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateGamma2ToPQHigh(FfxFloat32x2 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateLinearToPQ(FfxFloat32x2 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(3u)) + ffxBroadcast2(0x378D8723u)); +} + +/// Compute a more accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. 
+/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateLinearToPQMedium(FfxFloat32x2 a) +{ + FfxFloat32x2 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(3u)) + ffxBroadcast2(0x378D8723u)); + FfxFloat32x2 b8 = b * b * b * b * b * b * b * b; + return b - b * (b8 - a) / (FfxFloat32(8.0) * b8); +} + +/// Compute a very accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x2 ffxApproximateLinearToPQHigh(FfxFloat32x2 a) +{ + return ffxSqrt(ffxSqrt(ffxSqrt(a))); +} + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and Gamma2. +/// +/// @returns +/// The value a converted into Gamma2. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximatePQToGamma2Medium(FfxFloat32x3 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and linear. +/// +/// @returns +/// The value a converted into linear. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximatePQToLinear(FfxFloat32x3 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateGamma2ToPQ(FfxFloat32x3 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(2u)) + ffxBroadcast3(0x2F9A4E46u)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. 
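
The linear-to-PQ approximations in this block exploit PQ being close to x^(1/8): the low-quality form divides the exponent by eight with a bit shift, and the medium-quality form adds one Newton-Raphson step on b^8 = a. A CPU-side C sketch of that pattern using the same magic constant as the header; names are illustrative:

#include <stdint.h>
#include <string.h>
#include <math.h>
#include <stdio.h>

static uint32_t f2u(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
static float    u2f(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

/* Low quality a^(1/8) (the ffxApproximateLinearToPQ pattern). */
static float approx_eighth_root(float a) { return u2f((f2u(a) >> 3) + 0x378D8723u); }

/* Medium quality: refine with one Newton-Raphson step on b^8 = a
 * (the ffxApproximateLinearToPQMedium pattern). */
static float approx_eighth_root_medium(float a)
{
    float b  = approx_eighth_root(a);
    float b8 = b * b * b * b * b * b * b * b;
    return b - b * (b8 - a) / (8.0f * b8);
}

int main(void)
{
    float a = 0.5f;
    printf("exact=%f low=%f medium=%f\n",
           powf(a, 1.0f / 8.0f), approx_eighth_root(a), approx_eighth_root_medium(a));
    return 0;
}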
+/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateGamma2ToPQMedium(FfxFloat32x3 a) +{ + FfxFloat32x3 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(2u)) + ffxBroadcast3(0x2F9A4E46u)); + FfxFloat32x3 b4 = b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateGamma2ToPQHigh(FfxFloat32x3 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateLinearToPQ(FfxFloat32x3 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(3u)) + ffxBroadcast3(0x378D8723u)); +} + +/// Compute a more accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateLinearToPQMedium(FfxFloat32x3 a) +{ + FfxFloat32x3 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(3u)) + ffxBroadcast3(0x378D8723u)); + FfxFloat32x3 b8 = b * b * b * b * b * b * b * b; + return b - b * (b8 - a) / (FfxFloat32(8.0) * b8); +} + +/// Compute a very accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x3 ffxApproximateLinearToPQHigh(FfxFloat32x3 a) +{ + return ffxSqrt(ffxSqrt(ffxSqrt(a))); +} + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and Gamma2. +/// +/// @returns +/// The value a converted into Gamma2. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximatePQToGamma2Medium(FfxFloat32x4 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. 
+/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between PQ and linear. +/// +/// @returns +/// The value a converted into linear. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximatePQToLinear(FfxFloat32x4 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateGamma2ToPQ(FfxFloat32x4 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(2u)) + ffxBroadcast4(0x2F9A4E46u)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateGamma2ToPQMedium(FfxFloat32x4 a) +{ + FfxFloat32x4 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(2u)) + ffxBroadcast4(0x2F9A4E46u)); + FfxFloat32x4 b4 = b * b * b * b * b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between gamma2 and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateGamma2ToPQHigh(FfxFloat32x4 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateLinearToPQ(FfxFloat32x4 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(3u)) + ffxBroadcast4(0x378D8723u)); +} + +/// Compute a more accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. 
+/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateLinearToPQMedium(FfxFloat32x4 a) +{ + FfxFloat32x4 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(3u)) + ffxBroadcast4(0x378D8723u)); + FfxFloat32x4 b8 = b * b * b * b * b * b * b * b; + return b - b * (b8 - a) / (FfxFloat32(8.0) * b8); +} + +/// Compute a very accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param a The value to convert between linear and PQ. +/// +/// @returns +/// The value a converted into PQ. +/// +/// @ingroup GPU +FfxFloat32x4 ffxApproximateLinearToPQHigh(FfxFloat32x4 a) +{ + return ffxSqrt(ffxSqrt(ffxSqrt(a))); +} + +// An approximation of sine. +// +// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +// is {-1/4 to 1/4} representing {-1 to 1}. +// +// @param [in] value The value to calculate approximate sine for. +// +// @returns +// The approximate sine of value. +FfxFloat32 ffxParabolicSin(FfxFloat32 value) +{ + return value * abs(value) - value; +} + +// An approximation of sine. +// +// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +// is {-1/4 to 1/4} representing {-1 to 1}. +// +// @param [in] value The value to calculate approximate sine for. +// +// @returns +// The approximate sine of value. +FfxFloat32x2 ffxParabolicSin(FfxFloat32x2 x) +{ + return x * abs(x) - x; +} + +// An approximation of cosine. +// +// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +// is {-1/4 to 1/4} representing {-1 to 1}. +// +// @param [in] value The value to calculate approximate cosine for. +// +// @returns +// The approximate cosine of value. +FfxFloat32 ffxParabolicCos(FfxFloat32 x) +{ + x = ffxFract(x * FfxFloat32(0.5) + FfxFloat32(0.75)); + x = x * FfxFloat32(2.0) - FfxFloat32(1.0); + return ffxParabolicSin(x); +} + +// An approximation of cosine. +// +// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +// is {-1/4 to 1/4} representing {-1 to 1}. +// +// @param [in] value The value to calculate approximate cosine for. +// +// @returns +// The approximate cosine of value. +FfxFloat32x2 ffxParabolicCos(FfxFloat32x2 x) +{ + x = ffxFract(x * ffxBroadcast2(0.5f) + ffxBroadcast2(0.75f)); + x = x * ffxBroadcast2(2.0f) - ffxBroadcast2(1.0f); + return ffxParabolicSin(x); +} + +// An approximation of both sine and cosine. +// +// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +// is {-1/4 to 1/4} representing {-1 to 1}. +// +// @param [in] value The value to calculate approximate cosine for. +// +// @returns +// A FfxFloat32x2 containing approximations of both sine and cosine of value. +FfxFloat32x2 ffxParabolicSinCos(FfxFloat32 x) +{ + FfxFloat32 y = ffxFract(x * FfxFloat32(0.5) + FfxFloat32(0.75)); + y = y * FfxFloat32(2.0) - FfxFloat32(1.0); + return ffxParabolicSin(FfxFloat32x2(x, y)); +} + +/// Conditional free logic AND operation using values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. 
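
The parabolic sine/cosine above approximate one full period with a single parabola; x in [-1, 1] spans one period and the result is scaled to [-1/4, 1/4]. A CPU-side C sketch comparing against sinf/cosf under that reading (the angle mapping theta = pi * (x + 1) is inferred from the formulas, not stated by the header):

#include <math.h>
#include <stdio.h>

static float fract(float x) { return x - floorf(x); }

/* Parabolic sine (the ffxParabolicSin formula). */
static float parabolic_sin(float x) { return x * fabsf(x) - x; }

/* Parabolic cosine: add a quarter period (0.5 in this domain) with
 * wraparound, then reuse the sine parabola. */
static float parabolic_cos(float x)
{
    x = fract(x * 0.5f + 0.75f);
    x = x * 2.0f - 1.0f;
    return parabolic_sin(x);
}

int main(void)
{
    const float pi = 3.14159265f;
    for (float x = -1.0f; x <= 1.0f; x += 0.25f)
        printf("x=%5.2f sin %7.4f vs %7.4f | cos %7.4f vs %7.4f\n", x,
               4.0f * parabolic_sin(x), sinf(pi * (x + 1.0f)),
               4.0f * parabolic_cos(x), cosf(pi * (x + 1.0f)));
    return 0;
}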
+/// +/// @ingroup GPU +FfxUInt32 ffxZeroOneAnd(FfxUInt32 x, FfxUInt32 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt32x2 ffxZeroOneAnd(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt32x3 ffxZeroOneAnd(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt32x4 ffxZeroOneAnd(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return min(x, y); +} + +/// Conditional free logic NOT operation using two values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt32 ffxZeroOneAnd(FfxUInt32 x) +{ + return x ^ FfxUInt32(1); +} + +/// Conditional free logic NOT operation using two values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt32x2 ffxZeroOneAnd(FfxUInt32x2 x) +{ + return x ^ ffxBroadcast2(1u); +} + +/// Conditional free logic NOT operation using two values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt32x3 ffxZeroOneAnd(FfxUInt32x3 x) +{ + return x ^ ffxBroadcast3(1u); +} + +/// Conditional free logic NOT operation using two values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt32x4 ffxZeroOneAnd(FfxUInt32x4 x) +{ + return x ^ ffxBroadcast4(1u); +} + +/// Conditional free logic OR operation using two values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt32 ffxZeroOneOr(FfxUInt32 x, FfxUInt32 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt32x2 ffxZeroOneOr(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. 
+/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt32x3 ffxZeroOneOr(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt32x4 ffxZeroOneOr(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return max(x, y); +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxUInt32 ffxZeroOneAndToU1(FfxFloat32 x) +{ + return FfxUInt32(FfxFloat32(1.0) - x); +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxUInt32x2 ffxZeroOneAndToU2(FfxFloat32x2 x) +{ + return FfxUInt32x2(ffxBroadcast2(1.0) - x); +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxUInt32x3 ffxZeroOneAndToU3(FfxFloat32x3 x) +{ + return FfxUInt32x3(ffxBroadcast3(1.0) - x); +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxUInt32x4 ffxZeroOneAndToU4(FfxFloat32x4 x) +{ + return FfxUInt32x4(ffxBroadcast4(1.0) - x); +} + +/// Conditional free logic AND operation using two values followed by a NOT operation +/// using the resulting value and a third value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneAndOr(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two values followed by a NOT operation +/// using the resulting value and a third value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneAndOr(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two values followed by a NOT operation +/// using the resulting value and a third value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. 
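
The ffxZeroOne* helpers implement boolean algebra on 0/1 masks without branches: AND is min, OR is max, NOT is 1 - x (or x ^ 1 for integers), and AndOr(x, y, z) = saturate(x * y + z), i.e. (x AND y) OR z on 0/1 inputs. A CPU-side C sketch of the truth table, with illustrative names:

#include <math.h>
#include <stdio.h>

static float saturate(float x) { return fminf(fmaxf(x, 0.0f), 1.0f); }

static float zo_and(float x, float y)             { return fminf(x, y); }         /* x AND y */
static float zo_or(float x, float y)              { return fmaxf(x, y); }         /* x OR y  */
static float zo_not(float x)                      { return 1.0f - x; }            /* NOT x   */
static float zo_and_or(float x, float y, float z) { return saturate(x * y + z); } /* (x AND y) OR z */

int main(void)
{
    for (int x = 0; x <= 1; ++x)
        for (int y = 0; y <= 1; ++y)
            for (int z = 0; z <= 1; ++z)
                printf("(%d AND %d) OR %d = %g\n", x, y, z,
                       zo_and_or((float)x, (float)y, (float)z));
    printf("AND(1,0)=%g OR(1,0)=%g NOT(1)=%g\n",
           zo_and(1.0f, 0.0f), zo_or(1.0f, 0.0f), zo_not(1.0f));
    return 0;
}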
+/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneAndOr(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two values followed by a NOT operation +/// using the resulting value and a third value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneAndOr(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return ffxSaturate(x * y + z); +} + +/// Given a value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneIsGreaterThanZero(FfxFloat32 x) +{ + return ffxSaturate(x * FfxFloat32(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneIsGreaterThanZero(FfxFloat32x2 x) +{ + return ffxSaturate(x * ffxBroadcast2(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneIsGreaterThanZero(FfxFloat32x3 x) +{ + return ffxSaturate(x * ffxBroadcast3(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneIsGreaterThanZero(FfxFloat32x4 x) +{ + return ffxSaturate(x * ffxBroadcast4(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Conditional free logic signed NOT operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneAnd(FfxFloat32 x) +{ + return FfxFloat32(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneAnd(FfxFloat32x2 x) +{ + return ffxBroadcast2(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneAnd(FfxFloat32x3 x) +{ + return ffxBroadcast3(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneAnd(FfxFloat32x4 x) +{ + return ffxBroadcast4(1.0) - x; +} + +/// Conditional free logic OR operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. 
+/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneOr(FfxFloat32 x, FfxFloat32 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneOr(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneOr(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneOr(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return max(x, y); +} + +/// Choose between two FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneSelect(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + FfxFloat32 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneSelect(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + FfxFloat32x2 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneSelect(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + FfxFloat32x3 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneSelect(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + FfxFloat32x4 r = (-x) * z + z; + return x * y + r; +} + +/// Given a value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. 
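
ffxZeroOneSelect above is a branch-free select on a 0/1 mask, written as two multiply-add shaped expressions. A minimal C sketch (illustrative name):

#include <stdio.h>

/* Returns z when x == 0 and y when x == 1 (the ffxZeroOneSelect pattern):
 * r = (1 - x) * z, result = x * y + r. */
static float zero_one_select(float x, float y, float z)
{
    float r = (-x) * z + z;
    return x * y + r;
}

int main(void)
{
    printf("%g\n", zero_one_select(0.0f, 10.0f, 20.0f)); /* 20 */
    printf("%g\n", zero_one_select(1.0f, 10.0f, 20.0f)); /* 10 */
    return 0;
}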
+/// +/// @ingroup GPU +FfxFloat32 ffxZeroOneIsSigned(FfxFloat32 x) +{ + return ffxSaturate(x * FfxFloat32(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat32x2 ffxZeroOneIsSigned(FfxFloat32x2 x) +{ + return ffxSaturate(x * ffxBroadcast2(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat32x3 ffxZeroOneIsSigned(FfxFloat32x3 x) +{ + return ffxSaturate(x * ffxBroadcast3(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// Given a value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat32x4 ffxZeroOneIsSigned(FfxFloat32x4 x) +{ + return ffxSaturate(x * ffxBroadcast4(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] color The color to convert to Rec. 709. +/// +/// @returns +/// The color in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxRec709FromLinear(FfxFloat32 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099); + return clamp(j.x, color * j.y, pow(color, j.z) * k.x + k.y); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] color The color to convert to Rec. 709. +/// +/// @returns +/// The color in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxRec709FromLinear(FfxFloat32x2 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099); + return clamp(j.xx, color * j.yy, pow(color, j.zz) * k.xx + k.yy); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] color The color to convert to Rec. 709. +/// +/// @returns +/// The color in linear space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxRec709FromLinear(FfxFloat32x3 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099); + return clamp(j.xxx, color * j.yyy, pow(color, j.zzz) * k.xxx + k.yyy); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. 
+/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat32 ffxGammaFromLinear(FfxFloat32 color, FfxFloat32 rcpX) +{ + return pow(color, FfxFloat32(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxGammaFromLinear(FfxFloat32x2 color, FfxFloat32 rcpX) +{ + return pow(color, ffxBroadcast2(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxGammaFromLinear(FfxFloat32x3 color, FfxFloat32 rcpX) +{ + return pow(color, ffxBroadcast3(rcpX)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxPQToLinear(FfxFloat32 x) +{ + FfxFloat32 p = pow(x, FfxFloat32(0.159302)); + return pow((FfxFloat32(0.835938) + FfxFloat32(18.8516) * p) / (FfxFloat32(1.0) + FfxFloat32(18.6875) * p), FfxFloat32(78.8438)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxPQToLinear(FfxFloat32x2 x) +{ + FfxFloat32x2 p = pow(x, ffxBroadcast2(0.159302)); + return pow((ffxBroadcast2(0.835938) + ffxBroadcast2(18.8516) * p) / (ffxBroadcast2(1.0) + ffxBroadcast2(18.6875) * p), ffxBroadcast2(78.8438)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxPQToLinear(FfxFloat32x3 x) +{ + FfxFloat32x3 p = pow(x, ffxBroadcast3(0.159302)); + return pow((ffxBroadcast3(0.835938) + ffxBroadcast3(18.8516) * p) / (ffxBroadcast3(1.0) + ffxBroadcast3(18.6875) * p), ffxBroadcast3(78.8438)); +} + +/// Compute a linear value from a SRGB value. +/// +/// @param [in] value The value to convert to linear from SRGB. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPU +FfxFloat32 ffxSrgbToLinear(FfxFloat32 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return clamp(j.x, color * j.y, pow(color, j.z) * k.x + k.y); +} + +/// Compute a linear value from a SRGB value. +/// +/// @param [in] value The value to convert to linear from SRGB. +/// +/// @returns +/// A value in SRGB space. 
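+///
+/// As an illustrative round-trip sketch (not part of the upstream documentation; the variable
+/// names are made up): the constants above are the standard sRGB curve constants, and
+/// ffxLinearFromSrgb() later in this file applies the matching inverse, so chaining the two
+/// approximately returns the input for values in [0, 1]:
+///
+///     FfxFloat32 grey    = 0.5;
+///     FfxFloat32 encoded = ffxSrgbToLinear(grey);       // ~0.735 = 1.055 * pow(0.5, 1/2.4) - 0.055
+///     FfxFloat32 decoded = ffxLinearFromSrgb(encoded);  // ~0.5 again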
+/// +/// @ingroup GPU +FfxFloat32x2 ffxSrgbToLinear(FfxFloat32x2 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return clamp(j.xx, color * j.yy, pow(color, j.zz) * k.xx + k.yy); +} + +/// Compute a linear value from a SRGB value. +/// +/// @param [in] value The value to convert to linear from SRGB. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxSrgbToLinear(FfxFloat32x3 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return clamp(j.xxx, color * j.yyy, pow(color, j.zzz) * k.xxx + k.yyy); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] color The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxLinearFromRec709(FfxFloat32 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.x), color * j.y, pow(color * k.x + k.y, j.z)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] color The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxLinearFromRec709(FfxFloat32x2 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xx), color * j.yy, pow(color * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] color The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxLinearFromRec709(FfxFloat32x3 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xxx), color * j.yyy, pow(color * k.xxx + k.yyy, j.zzz)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The value to convert to linear in gamma space. +/// @param [in] power The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxLinearFromGamma(FfxFloat32 color, FfxFloat32 power) +{ + return pow(color, FfxFloat32(power)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The value to convert to linear in gamma space. +/// @param [in] power The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxLinearFromGamma(FfxFloat32x2 color, FfxFloat32 power) +{ + return pow(color, ffxBroadcast2(power)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The value to convert to linear in gamma space. +/// @param [in] power The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. 
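+///
+/// As an illustrative pairing sketch (not part of the upstream documentation; the variable names
+/// are made up): ffxGammaFromLinear() expects the reciprocal exponent while ffxLinearFromGamma()
+/// expects the exponent itself, so for a simple 2.2 display curve the two functions invert each other:
+///
+///     FfxFloat32 value   = 0.5;
+///     FfxFloat32 encoded = ffxGammaFromLinear(value, 1.0 / 2.2);  // linear -> gamma, ~0.730
+///     FfxFloat32 linear  = ffxLinearFromGamma(encoded, 2.2);      // gamma  -> linear, ~0.5 again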
+/// +/// @ingroup GPU +FfxFloat32x3 ffxLinearFromGamma(FfxFloat32x3 color, FfxFloat32 power) +{ + return pow(color, ffxBroadcast3(power)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxLinearFromPQ(FfxFloat32 x) +{ + FfxFloat32 p = pow(x, FfxFloat32(0.0126833)); + return pow(ffxSaturate(p - FfxFloat32(0.835938)) / (FfxFloat32(18.8516) - FfxFloat32(18.6875) * p), FfxFloat32(6.27739)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxLinearFromPQ(FfxFloat32x2 x) +{ + FfxFloat32x2 p = pow(x, ffxBroadcast2(0.0126833)); + return pow(ffxSaturate(p - ffxBroadcast2(0.835938)) / (ffxBroadcast2(18.8516) - ffxBroadcast2(18.6875) * p), ffxBroadcast2(6.27739)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x3 ffxLinearFromPQ(FfxFloat32x3 x) +{ + FfxFloat32x3 p = pow(x, ffxBroadcast3(0.0126833)); + return pow(ffxSaturate(p - ffxBroadcast3(0.835938)) / (ffxBroadcast3(18.8516) - ffxBroadcast3(18.6875) * p), ffxBroadcast3(6.27739)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32 ffxLinearFromSrgb(FfxFloat32 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.x), color * j.y, pow(color * k.x + k.y, j.z)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat32x2 ffxLinearFromSrgb(FfxFloat32x2 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xx), color * j.yy, pow(color * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. 
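+///
+/// As an illustrative note on the PQ helpers above (not part of the upstream documentation; the
+/// variable names are made up): the constants 0.835938, 18.8516, 18.6875, 0.159302 and 78.8438
+/// used by ffxPQToLinear() and ffxLinearFromPQ() match the SMPTE ST 2084 (PQ) curve constants,
+/// and the two functions invert each other, so chaining them approximately returns the input:
+///
+///     FfxFloat32 pq        = ffxPQToLinear(0.5);   // apply one side of the PQ mapping
+///     FfxFloat32 roundTrip = ffxLinearFromPQ(pq);  // ~0.5 again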
+/// +/// @ingroup GPU +FfxFloat32x3 ffxLinearFromSrgb(FfxFloat32x3 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xxx), color * j.yyy, pow(color * k.xxx + k.yyy, j.zzz)); +} + +/// A remapping of 64x1 to 8x8 imposing rotated 2x2 pixel quads in quad linear. +/// +/// 543210 +/// ====== +/// ..xxx. +/// yy...y +/// +/// @param [in] a The input 1D coordinates to remap. +/// +/// @returns +/// The remapped 2D coordinates. +/// +/// @ingroup GPU +FfxUInt32x2 ffxRemapForQuad(FfxUInt32 a) +{ + return FfxUInt32x2(bitfieldExtract(a, 1u, 3u), bitfieldInsertMask(bitfieldExtract(a, 3u, 3u), a, 1u)); +} + +/// A helper function performing a remap 64x1 to 8x8 remapping which is necessary for 2D wave reductions. +/// +/// The 64-wide lane indices to 8x8 remapping is performed as follows: +/// +/// 00 01 08 09 10 11 18 19 +/// 02 03 0a 0b 12 13 1a 1b +/// 04 05 0c 0d 14 15 1c 1d +/// 06 07 0e 0f 16 17 1e 1f +/// 20 21 28 29 30 31 38 39 +/// 22 23 2a 2b 32 33 3a 3b +/// 24 25 2c 2d 34 35 3c 3d +/// 26 27 2e 2f 36 37 3e 3f +/// +/// @param [in] a The input 1D coordinate to remap. +/// +/// @returns +/// The remapped 2D coordinates. +/// +/// @ingroup GPU +FfxUInt32x2 ffxRemapForWaveReduction(FfxUInt32 a) +{ + return FfxUInt32x2(bitfieldInsertMask(bitfieldExtract(a, 2u, 3u), a, 1u), bitfieldInsertMask(bitfieldExtract(a, 3u, 3u), bitfieldExtract(a, 1u, 2u), 2u)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common_half.h b/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common_half.h new file mode 100644 index 00000000000..c46ccb36575 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core_gpu_common_half.h @@ -0,0 +1,2978 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#if FFX_HALF +#if FFX_HLSL_6_2 +/// A define value for 16bit positive infinity. +/// +/// @ingroup GPU +#define FFX_POSITIVE_INFINITY_HALF FFX_TO_FLOAT16((uint16_t)0x7c00u) + +/// A define value for 16bit negative infinity. +/// +/// @ingroup GPU +#define FFX_NEGATIVE_INFINITY_HALF FFX_TO_FLOAT16((uint16_t)0xfc00u) +#else +/// A define value for 16bit positive infinity. +/// +/// @ingroup GPU +#define FFX_POSITIVE_INFINITY_HALF FFX_TO_FLOAT16(0x7c00u) + +/// A define value for 16bit negative infinity. 
+/// +/// @ingroup GPU +#define FFX_NEGATIVE_INFINITY_HALF FFX_TO_FLOAT16(0xfc00u) +#endif // FFX_HLSL_6_2 + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16 ffxMin(FfxFloat16 x, FfxFloat16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x2 ffxMin(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x3 ffxMin(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x4 ffxMin(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16 ffxMin(FfxInt16 x, FfxInt16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x2 ffxMin(FfxInt16x2 x, FfxInt16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x3 ffxMin(FfxInt16x3 x, FfxInt16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x4 ffxMin(FfxInt16x4 x, FfxInt16x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16 ffxMin(FfxUInt16 x, FfxUInt16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16x2 ffxMin(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. 
+/// +/// @ingroup GPU +FfxUInt16x3 ffxMin(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16x4 ffxMin(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return min(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16 ffxMax(FfxFloat16 x, FfxFloat16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x2 ffxMax(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x3 ffxMax(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxFloat16x4 ffxMax(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16 ffxMax(FfxInt16 x, FfxInt16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x2 ffxMax(FfxInt16x2 x, FfxInt16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x3 ffxMax(FfxInt16x3 x, FfxInt16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxInt16x4 ffxMax(FfxInt16x4 x, FfxInt16x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16 ffxMax(FfxUInt16 x, FfxUInt16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. 
+/// +/// @ingroup GPU +FfxUInt16x2 ffxMax(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16x3 ffxMax(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The the lowest of two values. +/// +/// @ingroup GPU +FfxUInt16x4 ffxMax(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return max(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat16 ffxPow(FfxFloat16 x, FfxFloat16 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPow(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat16x3 ffxPow(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPU +FfxFloat16x4 ffxPow(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return pow(x, y); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat16 ffxSqrt(FfxFloat16 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat16x2 ffxSqrt(FfxFloat16x2 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat16x3 ffxSqrt(FfxFloat16x3 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The first value to compute the min of. +/// +/// @returns +/// The the square root of x. +/// +/// @ingroup GPU +FfxFloat16x4 ffxSqrt(FfxFloat16x4 x) +{ + return sqrt(x); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. 
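+///
+/// As an illustrative usage sketch (not part of the upstream documentation; the variable names
+/// are made up): because the sign bit of 's' is OR-ed into 'd' rather than replacing it, 'd' is
+/// expected to already be non-negative, e.g. a magnitude that should take the sign of a direction:
+///
+///     FfxFloat16 magnitude = FfxFloat16(0.25);   // non-negative input
+///     FfxFloat16 direction = FfxFloat16(-3.0);   // only its sign bit is used
+///     FfxFloat16 signedVal = ffxCopySignBitHalf(magnitude, direction);  // -0.25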
+/// +/// @ingroup GPU +FfxFloat16 ffxCopySignBitHalf(FfxFloat16 d, FfxFloat16 s) +{ + return FFX_TO_FLOAT16(FFX_TO_UINT16(d) | (FFX_TO_UINT16(s) & FFX_BROADCAST_UINT16(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat16x2 ffxCopySignBitHalf(FfxFloat16x2 d, FfxFloat16x2 s) +{ + return FFX_TO_FLOAT16X2(FFX_TO_UINT16X2(d) | (FFX_TO_UINT16X2(s) & FFX_BROADCAST_UINT16X2(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat16x3 ffxCopySignBitHalf(FfxFloat16x3 d, FfxFloat16x3 s) +{ + return FFX_TO_FLOAT16X3(FFX_TO_UINT16X3(d) | (FFX_TO_UINT16X3(s) & FFX_BROADCAST_UINT16X3(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPU +FfxFloat16x4 ffxCopySignBitHalf(FfxFloat16x4 d, FfxFloat16x4 s) +{ + return FFX_TO_FLOAT16X4(FFX_TO_UINT16X4(d) | (FFX_TO_UINT16X4(s) & FFX_BROADCAST_UINT16X4(0x8000u))); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat16 ffxIsSignedHalf(FfxFloat16 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat16x2 ffxIsSignedHalf(FfxFloat16x2 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X2(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. 
+/// +/// @ingroup GPU +FfxFloat16x3 ffxIsSignedHalf(FfxFloat16x3 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X3(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or position. +/// +/// @ingroup GPU +FfxFloat16x4 ffxIsSignedHalf(FfxFloat16x4 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X4(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat16 ffxIsGreaterThanZeroHalf(FfxFloat16 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat16x2 ffxIsGreaterThanZeroHalf(FfxFloat16x2 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X2(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat16x3 ffxIsGreaterThanZeroHalf(FfxFloat16x3 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X3(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 1 +/// m > 0 := 0 +/// m <= 0 := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is position, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPU +FfxFloat16x4 ffxIsGreaterThanZeroHalf(FfxFloat16x4 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X4(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Convert a 16bit floating point value to sortable integer. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the side effects that: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +/// +/// @param [in] x The floating point value to make sortable. +/// +/// @returns +/// The sortable integer value. +/// +/// @ingroup GPU +FfxUInt16 ffxFloatToSortableIntegerHalf(FfxUInt16 x) +{ + return x ^ ((ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16(15))) | FFX_BROADCAST_UINT16(0x8000)); +} + +/// Convert a sortable integer to a 16bit floating point value. 
+/// +/// The function has the side effects that: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] x The sortable integer value to make floating point. +/// +/// @returns +/// The floating point value. +/// +/// @ingroup GPU +FfxUInt16 ffxSortableIntegerToFloatHalf(FfxUInt16 x) +{ + return x ^ ((~ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16(15))) | FFX_BROADCAST_UINT16(0x8000)); +} + +/// Convert a pair of 16bit floating point values to a pair of sortable integers. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the side effects that: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +/// +/// @param [in] x The floating point values to make sortable. +/// +/// @returns +/// The sortable integer values. +/// +/// @ingroup GPU +FfxUInt16x2 ffxFloatToSortableIntegerHalf(FfxUInt16x2 x) +{ + return x ^ ((ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16X2(15))) | FFX_BROADCAST_UINT16X2(0x8000)); +} + +/// Convert a pair of sortable integers to a pair of 16bit floating point values. +/// +/// The function has the side effects that: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] x The sortable integer values to make floating point. +/// +/// @returns +/// The floating point values. +/// +/// @ingroup GPU +FfxUInt16x2 ffxSortableIntegerToFloatHalf(FfxUInt16x2 x) +{ + return x ^ ((~ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16X2(15))) | FFX_BROADCAST_UINT16X2(0x8000)); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y0 [Zero] X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesZeroY0ZeroX0(FfxUInt32x2 i) +{ + return ((i.x) & 0xffu) | ((i.y << 16) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y1 [Zero] X1 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesZeroY1ZeroX1(FfxUInt32x2 i) +{ + return ((i.x >> 8) & 0xffu) | ((i.y << 8) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y2 [Zero] X2 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesZeroY2ZeroX2(FfxUInt32x2 i) +{ + return ((i.x >> 16) & 0xffu) | ((i.y) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y3 [Zero] X3 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. 
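+///
+/// As an illustrative worked example for this family of helpers (not part of the upstream
+/// documentation): with i = FfxUInt32x2(0x44332211u, 0x88776655u) the byte lanes are
+/// X3 X2 X1 X0 = 44 33 22 11 and Y3 Y2 Y1 Y0 = 88 77 66 55, so
+///
+///     ffxPackBytesZeroY0ZeroX0(i) == 0x00550011u   // [Zero] Y0 [Zero] X0
+///     ffxPackBytesZeroY1ZeroX1(i) == 0x00660022u   // [Zero] Y1 [Zero] X1
+///     ffxPackBytesZeroY3ZeroX3(i) == 0x00880044u   // [Zero] Y3 [Zero] X3
+///
+/// and the remaining variants select and place the byte lanes in the same way.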
+/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesZeroY3ZeroX3(FfxUInt32x2 i) +{ + return ((i.x >> 24) & 0xffu) | ((i.y >> 8) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 Y1 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3Y2Y1X0(FfxUInt32x2 i) +{ + return ((i.x) & 0x000000ffu) | (i.y & 0xffffff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 Y1 X2 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3Y2Y1X2(FfxUInt32x2 i) +{ + return ((i.x >> 16) & 0x000000ffu) | (i.y & 0xffffff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 X0 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3Y2X0Y0(FfxUInt32x2 i) +{ + return ((i.x << 8) & 0x0000ff00u) | (i.y & 0xffff00ffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 X2 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3Y2X2Y0(FfxUInt32x2 i) +{ + return ((i.x >> 8) & 0x0000ff00u) | (i.y & 0xffff00ffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 X0 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3X0Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 16) & 0x00ff0000u) | (i.y & 0xff00ffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 X2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY3X2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x) & 0x00ff0000u) | (i.y & 0xff00ffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// X0 Y2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesX0Y2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 24) & 0xff000000u) | (i.y & 0x00ffffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. 
+/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// X2 Y2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesX2Y2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 8) & 0xff000000u) | (i.y & 0x00ffffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y2 X2 Y0 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY2X2Y0X0(FfxUInt32x2 i) +{ + return ((i.x) & 0x00ff00ffu) | ((i.y << 8) & 0xff00ff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y2 Y0 X2 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPU +FfxUInt32 ffxPackBytesY2Y0X2X0(FfxUInt32x2 i) +{ + return (((i.x) & 0xffu) | ((i.x >> 8) & 0xff00u) | ((i.y << 16) & 0xff0000u) | ((i.y << 8) & 0xff000000u)); +} + +/// Takes two Float16x2 values x and y, normalizes them and builds a single Uint16x2 value in the format {{x0,y0},{x1,y1}}. +/// +/// @param [in] x The first float16x2 value to pack. +/// @param [in] y The second float16x2 value to pack. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt16x2 ffxPackX0Y0X1Y1UnsignedToUint16x2(FfxFloat16x2 x, FfxFloat16x2 y) +{ + x *= FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0); + y *= FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0); + return FFX_UINT32_TO_UINT16X2(ffxPackBytesY2X2Y0X0(FfxUInt32x2(FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(x)), FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(y))))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// r=ffxPermuteUByte0Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteUByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// r=ffxPermuteUByte1Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? 
+/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteUByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// r=ffxPermuteUByte2Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteUByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// r=ffxPermuteUByte3Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteUByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteUByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. 
+/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteUByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteUByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteUByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Takes two Float16x2 values x and y, normalizes them and builds a single Uint16x2 value in the format {{x0,y0},{x1,y1}}. +/// +/// @param [in] x The first float16x2 value to pack. +/// @param [in] y The second float16x2 value to pack. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt16x2 ffxPackX0Y0X1Y1SignedToUint16x2(FfxFloat16x2 x, FfxFloat16x2 y) +{ + x = x * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0); + y = y * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0); + return FFX_UINT32_TO_UINT16X2(ffxPackBytesY2X2Y0X0(FfxUInt32x2(FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(x)), FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(y))))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteSByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. 
+/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteSByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteSByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteSByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteZeroBasedSByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. 
+/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteZeroBasedSByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteZeroBasedSByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPU +FfxUInt32x2 ffxPermuteZeroBasedSByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteSByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. 
+/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteSByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteSByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteSByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteZeroBasedSByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteZeroBasedSByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteZeroBasedSByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function packs i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. 
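+
+// Illustrative usage sketch (not an SDK API): the permute helpers in this header move
+// individual byte lanes of two packed 32-bit words to and from packed half precision
+// without separate unpack instructions. Assumes FFX_HALF is enabled.
+FfxFloat16x2 ffxExampleExtractByteLane1Half(FfxUInt32x2 packedWords)
+{
+    // Pulls byte lane 1 (bits [8:15]) of packedWords.x and packedWords.y out as a half pair.
+    return ffxPermuteSByte1Uint2ToFloat16x2(packedWords);
+}
+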
+/// +/// @ingroup GPU +FfxFloat16x2 ffxPermuteZeroBasedSByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16 ffxApproximateSqrtHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16((FFX_TO_UINT16(a) >> FFX_BROADCAST_UINT16(1)) + FFX_BROADCAST_UINT16(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x2 ffxApproximateSqrtHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2((FFX_TO_UINT16X2(a) >> FFX_BROADCAST_UINT16X2(1)) + FFX_BROADCAST_UINT16X2(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x3 ffxApproximateSqrtHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3((FFX_TO_UINT16X3(a) >> FFX_BROADCAST_UINT16X3(1)) + FFX_BROADCAST_UINT16X3(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16 ffxApproximateReciprocalHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x7784) - FFX_TO_UINT16(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. 
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x2 ffxApproximateReciprocalHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x7784) - FFX_TO_UINT16X2(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x3 ffxApproximateReciprocalHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x7784) - FFX_TO_UINT16X3(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x4 ffxApproximateReciprocalHalf(FfxFloat16x4 a) +{ + return FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x7784) - FFX_TO_UINT16X4(a)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat16 ffxApproximateReciprocalMediumHalf(FfxFloat16 a) +{ + FfxFloat16 b = FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x778d) - FFX_TO_UINT16(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. 
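+
+// Note (illustrative): the "medium" reciprocal variants around this point refine the same
+// bit-trick seed with a single Newton-Raphson step, b = b * (2 - a * b), which roughly
+// doubles the number of correct bits at the cost of one extra multiply-add.
+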
+/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat16x2 ffxApproximateReciprocalMediumHalf(FfxFloat16x2 a) +{ + FfxFloat16x2 b = FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x778d) - FFX_TO_UINT16X2(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X2(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat16x3 ffxApproximateReciprocalMediumHalf(FfxFloat16x3 a) +{ + FfxFloat16x3 b = FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x778d) - FFX_TO_UINT16X3(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X3(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPU +FfxFloat16x4 ffxApproximateReciprocalMediumHalf(FfxFloat16x4 a) +{ + FfxFloat16x4 b = FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x778d) - FFX_TO_UINT16X4(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X4(2.0)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal of the square root for. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16 ffxApproximateReciprocalSquareRootHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x59a3) - (FFX_TO_UINT16(a) >> FFX_BROADCAST_UINT16(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal of the square root for. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. 
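+
+// Illustrative usage sketch (not an SDK API): the bit-pattern approximations above trade
+// accuracy for ALU cost, which is often acceptable for weighting heuristics. For example,
+// a rough normalize can be built from the low-quality reciprocal square root.
+FfxFloat16x3 ffxExampleFastNormalizeHalf(FfxFloat16x3 v)
+{
+    // |v|^-1 is estimated from the bit pattern of dot(v, v); no Newton refinement is applied.
+    return v * FFX_BROADCAST_FLOAT16X3(ffxApproximateReciprocalSquareRootHalf(dot(v, v)));
+}
+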
+/// +/// @ingroup GPU +FfxFloat16x2 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x59a3) - (FFX_TO_UINT16X2(a) >> FFX_BROADCAST_UINT16X2(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal of the square root for. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x3 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x59a3) - (FFX_TO_UINT16X3(a) >> FFX_BROADCAST_UINT16X3(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal of the square root for. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @ingroup GPU +FfxFloat16x4 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x4 a) +{ + return FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x59a3) - (FFX_TO_UINT16X4(a) >> FFX_BROADCAST_UINT16X4(1))); +} + +/// An approximation of sine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate sine for. +/// +/// @returns +/// The approximate sine of value. +FfxFloat16 ffxParabolicSinHalf(FfxFloat16 x) +{ + return x * abs(x) - x; +} + +/// An approximation of sine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate sine for. +/// +/// @returns +/// The approximate sine of value. +FfxFloat16x2 ffxParabolicSinHalf(FfxFloat16x2 x) +{ + return x * abs(x) - x; +} + +/// An approximation of cosine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate cosine for. +/// +/// @returns +/// The approximate cosine of value. +FfxFloat16 ffxParabolicCosHalf(FfxFloat16 x) +{ + x = ffxFract(x * FFX_BROADCAST_FLOAT16(0.5) + FFX_BROADCAST_FLOAT16(0.75)); + x = x * FFX_BROADCAST_FLOAT16(2.0) - FFX_BROADCAST_FLOAT16(1.0); + return ffxParabolicSinHalf(x); +} + +/// An approximation of cosine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate cosine for. +/// +/// @returns +/// The approximate cosine of value. 
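+
+// Illustrative usage sketch (not an SDK API): mapping an angle in radians onto the
+// conventions documented above. The input is wrapped into {-1, 1} (standing for {0, 2 pi})
+// and the {-1/4, 1/4} output is scaled by 4 to recover an approximate sine in {-1, 1}.
+FfxFloat16 ffxExampleApproxSinFromRadiansHalf(FfxFloat16 radians)
+{
+    // 0.15915494 ~= 1 / (2 pi); the wrap keeps the parabola inside its valid domain.
+    FfxFloat16 x = ffxFract(radians * FFX_BROADCAST_FLOAT16(0.15915494)) * FFX_BROADCAST_FLOAT16(2.0) - FFX_BROADCAST_FLOAT16(1.0);
+    return ffxParabolicSinHalf(x) * FFX_BROADCAST_FLOAT16(4.0);
+}
+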
+FfxFloat16x2 ffxParabolicCosHalf(FfxFloat16x2 x) +{ + x = ffxFract(x * FFX_BROADCAST_FLOAT16X2(0.5) + FFX_BROADCAST_FLOAT16X2(0.75)); + x = x * FFX_BROADCAST_FLOAT16X2(2.0) - FFX_BROADCAST_FLOAT16X2(1.0); + return ffxParabolicSinHalf(x); +} + +/// An approximation of both sine and cosine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate cosine for. +/// +/// @returns +/// A FfxFloat32x2 containing approximations of both sine and cosine of value. +FfxFloat16x2 ffxParabolicSinCosHalf(FfxFloat16 x) +{ + FfxFloat16 y = ffxFract(x * FFX_BROADCAST_FLOAT16(0.5) + FFX_BROADCAST_FLOAT16(0.75)); + y = y * FFX_BROADCAST_FLOAT16(2.0) - FFX_BROADCAST_FLOAT16(1.0); + return ffxParabolicSinHalf(FfxFloat16x2(x, y)); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt16 ffxZeroOneAndHalf(FfxUInt16 x, FfxUInt16 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt16x2 ffxZeroOneAndHalf(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt16x3 ffxZeroOneAndHalf(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxUInt16x4 ffxZeroOneAndHalf(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return min(x, y); +} + +/// Conditional free logic NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt16 ffxZeroOneNotHalf(FfxUInt16 x) +{ + return x ^ FFX_BROADCAST_UINT16(1); +} + +/// Conditional free logic NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt16x2 ffxZeroOneNotHalf(FfxUInt16x2 x) +{ + return x ^ FFX_BROADCAST_UINT16X2(1); +} + +/// Conditional free logic NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. 
+/// +/// @ingroup GPU +FfxUInt16x3 ffxZeroOneNotHalf(FfxUInt16x3 x) +{ + return x ^ FFX_BROADCAST_UINT16X3(1); +} + +/// Conditional free logic NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the NOT operator. +/// @param [in] y The second value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPU +FfxUInt16x4 ffxZeroOneNotHalf(FfxUInt16x4 x) +{ + return x ^ FFX_BROADCAST_UINT16X4(1); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt16 ffxZeroOneOrHalf(FfxUInt16 x, FfxUInt16 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt16x2 ffxZeroOneOrHalf(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt16x3 ffxZeroOneOrHalf(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxUInt16x4 ffxZeroOneOrHalf(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return max(x, y); +} + +/// Convert a half-precision FfxFloat32 value between 0.0f and 1.0f to a half-precision Uint. +/// +/// @param [in] x The value to converted to a Uint. +/// +/// @returns +/// The converted Uint value. +/// +/// @ingroup GPU +FfxUInt16 ffxZeroOneFloat16ToUint16(FfxFloat16 x) +{ + return FFX_TO_UINT16(x * FFX_TO_FLOAT16(FFX_TO_UINT16(1))); +} + +/// Convert a half-precision FfxFloat32 value between 0.0f and 1.0f to a half-precision Uint. +/// +/// @param [in] x The value to converted to a Uint. +/// +/// @returns +/// The converted Uint value. +/// +/// @ingroup GPU +FfxUInt16x2 ffxZeroOneFloat16x2ToUint16x2(FfxFloat16x2 x) +{ + return FFX_TO_UINT16X2(x * FFX_TO_FLOAT16X2(FfxUInt16x2(1, 1))); +} + +/// Convert a half-precision FfxFloat32 value between 0.0f and 1.0f to a half-precision Uint. +/// +/// @param [in] x The value to converted to a Uint. +/// +/// @returns +/// The converted Uint value. +/// +/// @ingroup GPU +FfxUInt16x3 ffxZeroOneFloat16x3ToUint16x3(FfxFloat16x3 x) +{ + return FFX_TO_UINT16X3(x * FFX_TO_FLOAT16X3(FfxUInt16x3(1, 1, 1))); +} + +/// Convert a half-precision FfxFloat32 value between 0.0f and 1.0f to a half-precision Uint. +/// +/// @param [in] x The value to converted to a Uint. +/// +/// @returns +/// The converted Uint value. +/// +/// @ingroup GPU +FfxUInt16x4 ffxZeroOneFloat16x4ToUint16x4(FfxFloat16x4 x) +{ + return FFX_TO_UINT16X4(x * FFX_TO_FLOAT16X4(FfxUInt16x4(1, 1, 1, 1))); +} + +/// Convert a half-precision FfxUInt32 value between 0 and 1 to a half-precision FfxFloat32. 
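+
+// Illustrative usage sketch (not an SDK API): the integer zero/one helpers above express
+// boolean algebra on 0/1 masks with min/max/xor, avoiding divergent branches. For example,
+// "a AND (NOT b)" for mask values restricted to 0 or 1:
+FfxUInt16 ffxExampleMaskAndNotHalf(FfxUInt16 a, FfxUInt16 b)
+{
+    return ffxZeroOneAndHalf(a, ffxZeroOneNotHalf(b));
+}
+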
+/// +/// @param [in] x The value to converted to a half-precision FfxFloat32. +/// +/// @returns +/// The converted half-precision FfxFloat32 value. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneUint16ToFloat16(FfxUInt16 x) +{ + return FFX_TO_FLOAT16(x * FFX_TO_UINT16(FFX_TO_FLOAT16(1.0))); +} + +/// Convert a half-precision FfxUInt32 value between 0 and 1 to a half-precision FfxFloat32. +/// +/// @param [in] x The value to converted to a half-precision FfxFloat32. +/// +/// @returns +/// The converted half-precision FfxFloat32 value. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneUint16x2ToFloat16x2(FfxUInt16x2 x) +{ + return FFX_TO_FLOAT16X2(x * FFX_TO_UINT16X2(FfxUInt16x2(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Convert a half-precision FfxUInt32 value between 0 and 1 to a half-precision FfxFloat32. +/// +/// @param [in] x The value to converted to a half-precision FfxFloat32. +/// +/// @returns +/// The converted half-precision FfxFloat32 value. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneUint16x3ToFloat16x3(FfxUInt16x3 x) +{ + return FFX_TO_FLOAT16X3(x * FFX_TO_UINT16X3(FfxUInt16x3(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Convert a half-precision FfxUInt32 value between 0 and 1 to a half-precision FfxFloat32. +/// +/// @param [in] x The value to converted to a half-precision FfxFloat32. +/// +/// @returns +/// The converted half-precision FfxFloat32 value. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneUint16x4ToFloat16x4(FfxUInt16x4 x) +{ + return FFX_TO_FLOAT16X4(x * FFX_TO_UINT16X4(FfxUInt16x4(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneAndHalf(FfxFloat16 x, FfxFloat16 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneAndHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneAndHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneAndHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return min(x, y); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation. 
+/// +/// @ingroup GPU +FfxFloat16 ffxSignedZeroOneAndOrHalf(FfxFloat16 x, FfxFloat16 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation. +/// +/// @ingroup GPU +FfxFloat16x2 ffxSignedZeroOneAndOrHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X2(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation. +/// +/// @ingroup GPU +FfxFloat16x3 ffxSignedZeroOneAndOrHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X3(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation. +/// +/// @ingroup GPU +FfxFloat16x4 ffxSignedZeroOneAndOrHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X4(1.0); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// a NOT operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneAndOrHalf(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// a NOT operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneAndOrHalf(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// a NOT operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneAndOrHalf(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// a NOT operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. 
+/// @param [in] z The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneAndOrHalf(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + return ffxSaturate(x * y + z); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x2 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X2(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x3 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X3(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x4 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X4(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneNotHalf(FfxFloat16 x) +{ + return FFX_BROADCAST_FLOAT16(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneNotHalf(FfxFloat16x2 x) +{ + return FFX_BROADCAST_FLOAT16X2(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneNotHalf(FfxFloat16x3 x) +{ + return FFX_BROADCAST_FLOAT16X3(1.0) - x; +} + +/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the AND OR operator. +/// +/// @returns +/// Result of the AND OR operation. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneNotHalf(FfxFloat16x4 x) +{ + return FFX_BROADCAST_FLOAT16X4(1.0) - x; +} + +/// Conditional free logic OR operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. 
+/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneOrHalf(FfxFloat16 x, FfxFloat16 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneOrHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneOrHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision FfxFloat32 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneOrHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return max(x, y); +} + +/// Choose between two half-precision FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneSelectHalf(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + FfxFloat16 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneSelectHalf(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + FfxFloat16x2 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneSelectHalf(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + FfxFloat16x3 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision FfxFloat32 values if the first paramter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparision is greater than zero. +/// @param [in] z The value to return if the comparision is less than or equal to zero. +/// +/// @returns +/// The selected value. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneSelectHalf(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + FfxFloat16x4 r = (-x) * z + z; + return x * y + r; +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. 
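+
+// Illustrative usage sketch (not an SDK API): the floating-point zero/one helpers compose
+// into a fully branchless per-component select, e.g. "keep a where a > 0, else take b".
+FfxFloat16x2 ffxExampleBranchlessPickHalf(FfxFloat16x2 a, FfxFloat16x2 b)
+{
+    // mask is 1.0 per component where a > 0.0 and 0.0 otherwise.
+    FfxFloat16x2 mask = ffxZeroOneIsGreaterThanZeroHalf(a);
+    // Two fused multiply-adds blend a and b according to the mask.
+    return ffxZeroOneSelectHalf(mask, a, b);
+}
+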
+/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat16 ffxZeroOneIsSignedHalf(FfxFloat16 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat16x2 ffxZeroOneIsSignedHalf(FfxFloat16x2 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X2(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat16x3 ffxZeroOneIsSignedHalf(FfxFloat16x3 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X3(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign value. +/// +/// @ingroup GPU +FfxFloat16x4 ffxZeroOneIsSignedHalf(FfxFloat16x4 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X4(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. +/// +/// @ingroup GPU +FfxFloat16 ffxRec709FromLinearHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.x, c * j.y, pow(c, j.z) * k.x + k.y); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxRec709FromLinearHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.xx, c * j.yy, pow(c, j.zz) * k.xx + k.yy); +} + +/// Compute a Rec.709 color space. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. 
+/// +/// @ingroup GPU +FfxFloat16x3 ffxRec709FromLinearHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.xxx, c * j.yyy, pow(c, j.zzz) * k.xxx + k.yyy); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat16 ffxGammaFromLinearHalf(FfxFloat16 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxGammaFromLinearHalf(FfxFloat16x2 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPU +FfxFloat16x3 ffxGammaFromLinearHalf(FfxFloat16x3 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(rcpX)); +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPU +FfxFloat16 ffxSrgbFromLinearHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return clamp(j.x, c * j.y, pow(c, j.z) * k.x + k.y); +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxSrgbFromLinearHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return clamp(j.xx, c * j.yy, pow(c, j.zz) * k.xx + k.yy); +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPU +FfxFloat16x3 ffxSrgbFromLinearHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return clamp(j.xxx, c * j.yyy, pow(c, j.zzz) * k.xxx + k.yyy); +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. 
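+
+// Worked reference point (illustrative, not an SDK API): linear mid-grey 0.5 encodes to
+// roughly 0.735 in sRGB (1.055 * pow(0.5, 1.0 / 2.4) - 0.055), a quick sanity check for
+// the transfer-function helpers above.
+FfxFloat16 ffxExampleEncodeForStorageHalf(FfxFloat16 linearValue)
+{
+    // e.g. linearValue = 0.5 returns approximately 0.735.
+    return ffxSrgbFromLinearHalf(linearValue);
+}
+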
+/// +/// @ingroup GPU +FfxFloat16 ffxSquareRootHalf(FfxFloat16 c) +{ + return sqrt(c); +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. +/// +/// @ingroup GPU +FfxFloat16x2 ffxSquareRootHalf(FfxFloat16x2 c) +{ + return sqrt(c); +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. +/// +/// @ingroup GPU +FfxFloat16x3 ffxSquareRootHalf(FfxFloat16x3 c) +{ + return sqrt(c); +} + +/// Compute the cube root of a value. +/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPU +FfxFloat16 ffxCubeRootHalf(FfxFloat16 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16(1.0 / 3.0)); +} + +/// Compute the cube root of a value. +/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPU +FfxFloat16x2 ffxCubeRootHalf(FfxFloat16x2 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(1.0 / 3.0)); +} + +/// Compute the cube root of a value. +/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPU +FfxFloat16x3 ffxCubeRootHalf(FfxFloat16x3 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(1.0 / 3.0)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16 ffxLinearFromRec709Half(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.x), c * j.y, pow(c * k.x + k.y, j.z)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxLinearFromRec709Half(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xx), c * j.yy, pow(c * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x3 ffxLinearFromRec709Half(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xxx), c * j.yyy, pow(c * k.xxx + k.yyy, j.zzz)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16 ffxLinearFromGammaHalf(FfxFloat16 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16(x)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. 
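+
+// Illustrative usage sketch (not an SDK API): per the 'rcpX' note above, a display gamma of
+// 2.2 is encoded with the reciprocal exponent and decoded with the exponent itself.
+FfxFloat16 ffxExampleGammaRoundTripHalf(FfxFloat16 linearValue)
+{
+    FfxFloat16 encoded = ffxGammaFromLinearHalf(linearValue, FFX_BROADCAST_FLOAT16(1.0 / 2.2));
+    return ffxLinearFromGammaHalf(encoded, FFX_BROADCAST_FLOAT16(2.2));
+}
+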
+/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxLinearFromGammaHalf(FfxFloat16x2 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(x)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x3 ffxLinearFromGammaHalf(FfxFloat16x3 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(x)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16 ffxLinearFromSrgbHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.x), c * j.y, pow(c * k.x + k.y, j.z)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x2 ffxLinearFromSrgbHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xx), c * j.yy, pow(c * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPU +FfxFloat16x3 ffxLinearFromSrgbHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045 / 12.92, 1.0 / 12.92, 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xxx), c * j.yyy, pow(c * k.xxx + k.yyy, j.zzz)); +} + +/// A remapping of 64x1 to 8x8 imposing rotated 2x2 pixel quads in quad linear. +/// +/// 543210 +/// ====== +/// ..xxx. +/// yy...y +/// +/// @param [in] a The input 1D coordinates to remap. +/// +/// @returns +/// The remapped 2D coordinates. +/// +/// @ingroup GPU +FfxUInt16x2 ffxRemapForQuadHalf(FfxUInt32 a) +{ + return FfxUInt16x2(bitfieldExtract(a, 1u, 3u), bitfieldInsertMask(bitfieldExtract(a, 3u, 3u), a, 1u)); +} + +/// A helper function performing a remap 64x1 to 8x8 remapping which is necessary for 2D wave reductions. +/// +/// The 64-wide lane indices to 8x8 remapping is performed as follows: +/// +/// 00 01 08 09 10 11 18 19 +/// 02 03 0a 0b 12 13 1a 1b +/// 04 05 0c 0d 14 15 1c 1d +/// 06 07 0e 0f 16 17 1e 1f +/// 20 21 28 29 30 31 38 39 +/// 22 23 2a 2b 32 33 3a 3b +/// 24 25 2c 2d 34 35 3c 3d +/// 26 27 2e 2f 36 37 3e 3f +/// +/// @param [in] a The input 1D coordinate to remap. +/// +/// @returns +/// The remapped 2D coordinates. 
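+
+// Illustrative usage sketch (not an SDK API): feeding a flat 64-wide invocation index
+// through the quad remap above walks an 8x8 tile in rotated 2x2 quads, e.g. from an
+// 8x8x1 compute workgroup's flattened local index.
+FfxUInt16x2 ffxExampleQuadCoordFromLaneHalf(FfxUInt32 flatLocalIndex)
+{
+    // Only the low 6 bits (64 lanes) participate in the remap.
+    return ffxRemapForQuadHalf(flatLocalIndex & 63u);
+}
+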
+/// +/// @ingroup GPU +FfxUInt16x2 ffxRemapForWaveReductionHalf(FfxUInt32 a) +{ + return FfxUInt16x2(bitfieldInsertMask(bitfieldExtract(a, 2u, 3u), a, 1u), bitfieldInsertMask(bitfieldExtract(a, 3u, 3u), bitfieldExtract(a, 1u, 2u), 2u)); +} + +#endif // FFX_HALF diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_hlsl.h b/thirdparty/amd-fsr2/shaders/ffx_core_hlsl.h new file mode 100644 index 00000000000..ad4ff6552d1 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_core_hlsl.h @@ -0,0 +1,1502 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// A define for abstracting shared memory between shading languages. +/// +/// @ingroup GPU +#define FFX_GROUPSHARED groupshared + +/// A define for abstracting compute memory barriers between shading languages. +/// +/// @ingroup GPU +#define FFX_GROUP_MEMORY_BARRIER GroupMemoryBarrierWithGroupSync + +/// A define added to accept static markup on functions to aid CPU/GPU portability of code. +/// +/// @ingroup GPU +#define FFX_STATIC static + +/// A define for abstracting loop unrolling between shading languages. +/// +/// @ingroup GPU +#define FFX_UNROLL [unroll] + +/// A define for abstracting a 'greater than' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_GREATER_THAN(x, y) x > y + +/// A define for abstracting a 'greater than or equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_GREATER_THAN_EQUAL(x, y) x >= y + +/// A define for abstracting a 'less than' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_LESS_THAN(x, y) x < y + +/// A define for abstracting a 'less than or equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_LESS_THAN_EQUAL(x, y) x <= y + +/// A define for abstracting an 'equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_EQUAL(x, y) x == y + +/// A define for abstracting a 'not equal' comparison operator between two types. +/// +/// @ingroup GPU +#define FFX_NOT_EQUAL(x, y) x != y + +/// Broadcast a scalar value to a 1-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 2-dimensional floating point vector. 
+/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X2(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X3(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_FLOAT32X4(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 1-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X2(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X3(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_UINT32X4(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 1-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32(x) FfxInt32(x) + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X2(x) FfxInt32(x) + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X3(x) FfxInt32(x) + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_INT32X4(x) FfxInt32(x) + +/// Broadcast a scalar value to a 1-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16(a) FFX_MIN16_F(a) + +/// Broadcast a scalar value to a 2-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X2(a) FFX_MIN16_F(a) + +/// Broadcast a scalar value to a 3-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X3(a) FFX_MIN16_F(a) + +/// Broadcast a scalar value to a 4-dimensional half-precision floating point vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_FLOAT16X4(a) FFX_MIN16_F(a) + +/// Broadcast a scalar value to a 1-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16(a) FFX_MIN16_U(a) + +/// Broadcast a scalar value to a 2-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X2(a) FFX_MIN16_U(a) + +/// Broadcast a scalar value to a 3-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X3(a) FFX_MIN16_U(a) + +/// Broadcast a scalar value to a 4-dimensional half-precision unsigned integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_UINT16X4(a) FFX_MIN16_U(a) + +/// Broadcast a scalar value to a 1-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16(a) FFX_MIN16_I(a) + +/// Broadcast a scalar value to a 2-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X2(a) FFX_MIN16_I(a) + +/// Broadcast a scalar value to a 3-dimensional half-precision signed integer vector. +/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X3(a) FFX_MIN16_I(a) + +/// Broadcast a scalar value to a 4-dimensional half-precision signed integer vector. 
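+
+// Note (illustrative, not an SDK API): these broadcast macros expand to a plain scalar cast
+// and rely on HLSL's implicit scalar-to-vector promotion to perform the splat inside vector
+// expressions, for example:
+FfxFloat32x2 ffxExampleBroadcastScale(FfxFloat32x2 value)
+{
+    // The scalar produced by FFX_BROADCAST_FLOAT32X2 is promoted to both components.
+    return value * FFX_BROADCAST_FLOAT32X2(0.5);
+}
+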
+/// +/// @ingroup GPU +#define FFX_BROADCAST_MIN_INT16X4(a) FFX_MIN16_I(a) + +/// Pack 2x32-bit floating point values in a single 32bit value. +/// +/// This function first converts each component of value into their nearest 16-bit floating +/// point representation, and then stores the X and Y components in the lower and upper 16 bits of the +/// 32bit unsigned integer respectively. +/// +/// @param [in] value A 2-dimensional floating point value to convert and pack. +/// +/// @returns +/// A packed 32bit value containing 2 16bit floating point values. +/// +/// @ingroup HLSL +FfxUInt32 packHalf2x16(FfxFloat32x2 value) +{ + return f32tof16(value.x) | (f32tof16(value.y) << 16); +} + +/// Broadcast a scalar value to a 2-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxBroadcast2(FfxFloat32 value) +{ + return FfxFloat32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxBroadcast3(FfxFloat32 value) +{ + return FfxFloat32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 4-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxBroadcast4(FfxFloat32 value) +{ + return FfxFloat32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSL +FfxInt32x2 ffxBroadcast2(FfxInt32 value) +{ + return FfxInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSL +FfxUInt32x3 ffxBroadcast3(FfxInt32 value) +{ + return FfxUInt32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 4-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSL +FfxInt32x4 ffxBroadcast4(FfxInt32 value) +{ + return FfxInt32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 2-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup HLSL +FfxUInt32x2 ffxBroadcast2(FfxUInt32 value) +{ + return FfxUInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional unsigned integer vector. +/// +/// @param [in] value The value to to broadcast. +/// +/// @returns +/// A 3-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup HLSL +FfxUInt32x3 ffxBroadcast3(FfxUInt32 value) +{ + return FfxUInt32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @param [in] value The value to to broadcast. 
+/// +/// @returns +/// A 4-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup HLSL +FfxUInt32x4 ffxBroadcast4(FfxUInt32 value) +{ + return FfxUInt32x4(value, value, value, value); +} + +FfxUInt32 bitfieldExtract(FfxUInt32 src, FfxUInt32 off, FfxUInt32 bits) +{ + FfxUInt32 mask = (1u << bits) - 1; + return (src >> off) & mask; +} + +FfxUInt32 bitfieldInsert(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 mask) +{ + return (ins & mask) | (src & (~mask)); +} + +FfxUInt32 bitfieldInsertMask(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 bits) +{ + FfxUInt32 mask = (1u << bits) - 1; + return (ins & mask) | (src & (~mask)); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSL +FfxUInt32 ffxAsUInt32(FfxFloat32 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSL +FfxUInt32x2 ffxAsUInt32(FfxFloat32x2 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSL +FfxUInt32x3 ffxAsUInt32(FfxFloat32x3 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSL +FfxUInt32x4 ffxAsUInt32(FfxFloat32x4 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSL +FfxFloat32 ffxAsFloat(FfxUInt32 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxAsFloat(FfxUInt32x2 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxAsFloat(FfxUInt32x3 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] value The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxAsFloat(FfxUInt32x4 x) +{ + return asfloat(x); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. 
Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interopation between two values. +/// +/// Implemented by calling the HLSL mix instrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 t) +{ + return lerp(x, y, t); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. 
+/// +/// @ingroup HLSL +FfxFloat32 ffxSaturate(FfxFloat32 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxSaturate(FfxFloat32x2 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxSaturate(FfxFloat32x3 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxSaturate(FfxFloat32x4 x) +{ + return saturate(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32 ffxFract(FfxFloat32 x) +{ + return x - floor(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxFract(FfxFloat32x2 x) +{ + return x - floor(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxFract(FfxFloat32x3 x) +{ + return x - floor(x); +} + +/// Compute the factional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxFract(FfxFloat32x4 x) +{ + return x - floor(x); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. 
+/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32 ffxMax3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxMax3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxMax3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxMax3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32 ffxMax3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x2 ffxMax3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x3 ffxMax3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. 
+/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calcuation. +/// @param [in] z The third value to include in the max calcuation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x4 ffxMax3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32 ffxMed3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxMed3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxMed3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxMed3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32 ffxMed3(FfxInt32 x, FfxInt32 y, FfxInt32 z) +{ + return max(min(x, y), min(max(x, y), z)); + // return min(max(min(y, z), x), max(y, z)); + // return max(max(x, y), z) == x ? max(y, z) : (max(max(x, y), z) == y ? max(x, z) : max(x, y)); +} + +/// Compute the median of three values. 
+/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x2 ffxMed3(FfxInt32x2 x, FfxInt32x2 y, FfxInt32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); + // return min(max(min(y, z), x), max(y, z)); + // return max(max(x, y), z) == x ? max(y, z) : (max(max(x, y), z) == y ? max(x, z) : max(x, y)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x3 ffxMed3(FfxInt32x3 x, FfxInt32x3 y, FfxInt32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calcuation. +/// @param [in] z The third value to include in the median calcuation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x4 ffxMed3(FfxInt32x4 x, FfxInt32x4 y, FfxInt32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32 ffxMin3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x2 ffxMin3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x3 ffxMin3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. 
+/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxFloat32x4 ffxMin3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32 ffxMin3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x2 ffxMin3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x3 ffxMin3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calcuation. +/// @param [in] z The third value to include in the min calcuation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSL +FfxUInt32x4 ffxMin3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return min(x, min(y, z)); +} + + +FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b) +{ + return FfxUInt32(FfxInt32(a) >> FfxInt32(b)); +} + +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== +#if FFX_HALF + +//============================================================================================================================== +// Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
+// Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ +FFX_MIN16_F2 ffxUint32ToFloat16x2(FfxUInt32 x) +{ + FfxFloat32x2 t = f16tof32(FfxUInt32x2(x & 0xFFFF, x >> 16)); + return FFX_MIN16_F2(t); +} +FFX_MIN16_F4 ffxUint32x2ToFloat16x4(FfxUInt32x2 x) +{ + return FFX_MIN16_F4(ffxUint32ToFloat16x2(x.x), ffxUint32ToFloat16x2(x.y)); +} +FFX_MIN16_U2 ffxUint32ToUint16x2(FfxUInt32 x) +{ + FfxUInt32x2 t = FfxUInt32x2(x & 0xFFFF, x >> 16); + return FFX_MIN16_U2(t); +} +FFX_MIN16_U4 ffxUint32x2ToUint16x4(FfxUInt32x2 x) +{ + return FFX_MIN16_U4(ffxUint32ToUint16x2(x.x), ffxUint32ToUint16x2(x.y)); +} +#define FFX_UINT32_TO_FLOAT16X2(x) ffxUint32ToFloat16x2(FfxUInt32(x)) +#define FFX_UINT32X2_TO_FLOAT16X4(x) ffxUint32x2ToFloat16x4(FfxUInt32x2(x)) +#define FFX_UINT32_TO_UINT16X2(x) ffxUint32ToUint16x2(FfxUInt32(x)) +#define FFX_UINT32X2_TO_UINT16X4(x) ffxUint32x2ToUint16x4(FfxUInt32x2(x)) +//------------------------------------------------------------------------------------------------------------------------------ +FfxUInt32 FFX_MIN16_F2ToUint32(FFX_MIN16_F2 x) +{ + return f32tof16(x.x) + (f32tof16(x.y) << 16); +} +FfxUInt32x2 FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4 x) +{ + return FfxUInt32x2(FFX_MIN16_F2ToUint32(x.xy), FFX_MIN16_F2ToUint32(x.zw)); +} +FfxUInt32 FFX_MIN16_U2ToUint32(FFX_MIN16_U2 x) +{ + return FfxUInt32(x.x) + (FfxUInt32(x.y) << 16); +} +FfxUInt32x2 FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4 x) +{ + return FfxUInt32x2(FFX_MIN16_U2ToUint32(x.xy), FFX_MIN16_U2ToUint32(x.zw)); +} +#define FFX_FLOAT16X2_TO_UINT32(x) FFX_MIN16_F2ToUint32(FFX_MIN16_F2(x)) +#define FFX_FLOAT16X4_TO_UINT32X2(x) FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4(x)) +#define FFX_UINT16X2_TO_UINT32(x) FFX_MIN16_U2ToUint32(FFX_MIN16_U2(x)) +#define FFX_UINT16X4_TO_UINT32X2(x) FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4(x)) + +#if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) +#define FFX_TO_UINT16(x) asuint16(x) +#define FFX_TO_UINT16X2(x) asuint16(x) +#define FFX_TO_UINT16X3(x) asuint16(x) +#define FFX_TO_UINT16X4(x) asuint16(x) +#else +#define FFX_TO_UINT16(a) FFX_MIN16_U(f32tof16(FfxFloat32(a))) +#define FFX_TO_UINT16X2(a) FFX_MIN16_U2(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y)) +#define FFX_TO_UINT16X3(a) FFX_MIN16_U3(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z)) +#define FFX_TO_UINT16X4(a) FFX_MIN16_U4(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z), FFX_TO_UINT16((a).w)) +#endif // #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) + +#if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) +#define FFX_TO_FLOAT16(x) asfloat16(x) +#define FFX_TO_FLOAT16X2(x) asfloat16(x) +#define FFX_TO_FLOAT16X3(x) asfloat16(x) +#define FFX_TO_FLOAT16X4(x) asfloat16(x) +#else +#define FFX_TO_FLOAT16(a) FFX_MIN16_F(f16tof32(FfxUInt32(a))) +#define FFX_TO_FLOAT16X2(a) FFX_MIN16_F2(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y)) +#define FFX_TO_FLOAT16X3(a) FFX_MIN16_F3(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z)) +#define FFX_TO_FLOAT16X4(a) FFX_MIN16_F4(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z), FFX_TO_FLOAT16((a).w)) +#endif // #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) + +//============================================================================================================================== +#define FFX_BROADCAST_FLOAT16(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X2(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X3(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X4(a) FFX_MIN16_F(a) + 
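
For reference, the pack/unpack helpers above all follow one bit-layout convention: the .x component lives in the low 16 bits of the 32-bit word and .y in the high 16 bits, matching f32tof16(value.x) | (f32tof16(value.y) << 16) in packHalf2x16 earlier in this header. A minimal CPU-side C++ sketch of that layout, operating on raw IEEE-754 binary16 bit patterns instead of real half conversions (helper names here are illustrative only, not part of the SDK):

#include <cassert>
#include <cstdint>

// Pack two 16-bit half bit patterns into one 32-bit word: x -> low 16 bits, y -> high 16 bits.
// Mirrors the convention of FFX_MIN16_F2ToUint32 / packHalf2x16 (illustrative sketch only).
static uint32_t pack_halves(uint16_t x_bits, uint16_t y_bits) {
    return uint32_t(x_bits) | (uint32_t(y_bits) << 16);
}

// Unpack the low/high halves again, mirroring ffxUint32ToFloat16x2's FfxUInt32x2(x & 0xFFFF, x >> 16).
static void unpack_halves(uint32_t packed, uint16_t &x_bits, uint16_t &y_bits) {
    x_bits = uint16_t(packed & 0xFFFFu);
    y_bits = uint16_t(packed >> 16);
}

int main() {
    const uint16_t one_h = 0x3C00; // 1.0 in IEEE-754 binary16
    const uint16_t two_h = 0x4000; // 2.0 in IEEE-754 binary16

    const uint32_t packed = pack_halves(one_h, two_h); // 0x40003C00
    assert(packed == 0x40003C00u);

    uint16_t x = 0, y = 0;
    unpack_halves(packed, x, y);
    assert(x == one_h && y == two_h); // the round trip is bit-exact
    return 0;
}
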
+//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_BROADCAST_INT16(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X2(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X3(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X4(a) FFX_MIN16_I(a) + +//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_BROADCAST_UINT16(a) FFX_MIN16_U(a) +#define FFX_BROADCAST_UINT16X2(a) FFX_MIN16_U(a) +#define FFX_BROADCAST_UINT16X3(a) FFX_MIN16_U(a) +#define FFX_BROADCAST_UINT16X4(a) FFX_MIN16_U(a) + +//============================================================================================================================== +FFX_MIN16_U ffxAbsHalf(FFX_MIN16_U a) +{ + return FFX_MIN16_U(abs(FFX_MIN16_I(a))); +} +FFX_MIN16_U2 ffxAbsHalf(FFX_MIN16_U2 a) +{ + return FFX_MIN16_U2(abs(FFX_MIN16_I2(a))); +} +FFX_MIN16_U3 ffxAbsHalf(FFX_MIN16_U3 a) +{ + return FFX_MIN16_U3(abs(FFX_MIN16_I3(a))); +} +FFX_MIN16_U4 ffxAbsHalf(FFX_MIN16_U4 a) +{ + return FFX_MIN16_U4(abs(FFX_MIN16_I4(a))); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxClampHalf(FFX_MIN16_F x, FFX_MIN16_F n, FFX_MIN16_F m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F2 ffxClampHalf(FFX_MIN16_F2 x, FFX_MIN16_F2 n, FFX_MIN16_F2 m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F3 ffxClampHalf(FFX_MIN16_F3 x, FFX_MIN16_F3 n, FFX_MIN16_F3 m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F4 ffxClampHalf(FFX_MIN16_F4 x, FFX_MIN16_F4 n, FFX_MIN16_F4 m) +{ + return max(n, min(x, m)); +} +//------------------------------------------------------------------------------------------------------------------------------ +// V_FRACT_F16 (note DX frac() is different). 
+FFX_MIN16_F ffxFract(FFX_MIN16_F x) +{ + return x - floor(x); +} +FFX_MIN16_F2 ffxFract(FFX_MIN16_F2 x) +{ + return x - floor(x); +} +FFX_MIN16_F3 ffxFract(FFX_MIN16_F3 x) +{ + return x - floor(x); +} +FFX_MIN16_F4 ffxFract(FFX_MIN16_F4 x) +{ + return x - floor(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxLerp(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 a) +{ + return lerp(x, y, a); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMax3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F2 ffxMax3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F3 ffxMax3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F4 ffxMax3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return max(x, max(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMin3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return min(x, min(y, z)); +} +FFX_MIN16_F2 ffxMin3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return min(x, min(y, z)); +} +FFX_MIN16_F3 ffxMin3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return min(x, min(y, z)); +} +FFX_MIN16_F4 ffxMin3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return min(x, min(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMed3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F2 ffxMed3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F3 ffxMed3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F4 ffxMed3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_I ffxMed3Half(FFX_MIN16_I x, FFX_MIN16_I y, FFX_MIN16_I z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I2 ffxMed3Half(FFX_MIN16_I2 x, FFX_MIN16_I2 y, FFX_MIN16_I2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I3 ffxMed3Half(FFX_MIN16_I3 x, FFX_MIN16_I3 y, FFX_MIN16_I3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I4 ffxMed3Half(FFX_MIN16_I4 x, FFX_MIN16_I4 y, FFX_MIN16_I4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} 
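
The ffxMed3 and ffxMed3Half overloads above all build on the same identity, max(min(x, y), min(max(x, y), z)) == median(x, y, z), which the comments note should map to a single V_MED3 instruction on GCN/RDNA hardware. A standalone C++ sketch (illustrative only, not part of the SDK) that exercises the identity over every ordering of three values:

#include <algorithm>
#include <cassert>

// Median of three values via the same expression used by ffxMed3 / ffxMed3Half:
// max(min(x, y), min(max(x, y), z)). Plain C++ for illustration only.
static float med3(float x, float y, float z) {
    return std::max(std::min(x, y), std::min(std::max(x, y), z));
}

int main() {
    // Every ordering of {1, 2, 3} yields the middle value, 2.
    const float v[3] = { 1.0f, 2.0f, 3.0f };
    int idx[3] = { 0, 1, 2 };
    do {
        assert(med3(v[idx[0]], v[idx[1]], v[idx[2]]) == 2.0f);
    } while (std::next_permutation(idx, idx + 3));
    return 0;
}
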
+//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxReciprocalHalf(FFX_MIN16_F x) +{ + return rcp(x); +} +FFX_MIN16_F2 ffxReciprocalHalf(FFX_MIN16_F2 x) +{ + return rcp(x); +} +FFX_MIN16_F3 ffxReciprocalHalf(FFX_MIN16_F3 x) +{ + return rcp(x); +} +FFX_MIN16_F4 ffxReciprocalHalf(FFX_MIN16_F4 x) +{ + return rcp(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxReciprocalSquareRootHalf(FFX_MIN16_F x) +{ + return rsqrt(x); +} +FFX_MIN16_F2 ffxReciprocalSquareRootHalf(FFX_MIN16_F2 x) +{ + return rsqrt(x); +} +FFX_MIN16_F3 ffxReciprocalSquareRootHalf(FFX_MIN16_F3 x) +{ + return rsqrt(x); +} +FFX_MIN16_F4 ffxReciprocalSquareRootHalf(FFX_MIN16_F4 x) +{ + return rsqrt(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxSaturate(FFX_MIN16_F x) +{ + return saturate(x); +} +FFX_MIN16_F2 ffxSaturate(FFX_MIN16_F2 x) +{ + return saturate(x); +} +FFX_MIN16_F3 ffxSaturate(FFX_MIN16_F3 x) +{ + return saturate(x); +} +FFX_MIN16_F4 ffxSaturate(FFX_MIN16_F4 x) +{ + return saturate(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_U ffxBitShiftRightHalf(FFX_MIN16_U a, FFX_MIN16_U b) +{ + return FFX_MIN16_U(FFX_MIN16_I(a) >> FFX_MIN16_I(b)); +} +FFX_MIN16_U2 ffxBitShiftRightHalf(FFX_MIN16_U2 a, FFX_MIN16_U2 b) +{ + return FFX_MIN16_U2(FFX_MIN16_I2(a) >> FFX_MIN16_I2(b)); +} +FFX_MIN16_U3 ffxBitShiftRightHalf(FFX_MIN16_U3 a, FFX_MIN16_U3 b) +{ + return FFX_MIN16_U3(FFX_MIN16_I3(a) >> FFX_MIN16_I3(b)); +} +FFX_MIN16_U4 ffxBitShiftRightHalf(FFX_MIN16_U4 a, FFX_MIN16_U4 b) +{ + return FFX_MIN16_U4(FFX_MIN16_I4(a) >> FFX_MIN16_I4(b)); +} +#endif // FFX_HALF + +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== +#if defined(FFX_WAVE) +// Where 'x' must be a compile time literal. 
+FfxFloat32 AWaveXorF1(FfxFloat32 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxFloat32x2 AWaveXorF2(FfxFloat32x2 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxFloat32x3 AWaveXorF3(FfxFloat32x3 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxFloat32x4 AWaveXorF4(FfxFloat32x4 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxUInt32 AWaveXorU1(FfxUInt32 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxUInt32x2 AWaveXorU1(FfxUInt32x2 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxUInt32x3 AWaveXorU1(FfxUInt32x3 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+FfxUInt32x4 AWaveXorU1(FfxUInt32x4 v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, WaveGetLaneIndex() ^ x);
+}
+
+#if FFX_HALF
+FfxFloat16x2 ffxWaveXorFloat16x2(FfxFloat16x2 v, FfxUInt32 x)
+{
+    return FFX_UINT32_TO_FLOAT16X2(WaveReadLaneAt(FFX_FLOAT16X2_TO_UINT32(v), WaveGetLaneIndex() ^ x));
+}
+FfxFloat16x4 ffxWaveXorFloat16x4(FfxFloat16x4 v, FfxUInt32 x)
+{
+    return FFX_UINT32X2_TO_FLOAT16X4(WaveReadLaneAt(FFX_FLOAT16X4_TO_UINT32X2(v), WaveGetLaneIndex() ^ x));
+}
+FfxUInt16x2 ffxWaveXorUint16x2(FfxUInt16x2 v, FfxUInt32 x)
+{
+    return FFX_UINT32_TO_UINT16X2(WaveReadLaneAt(FFX_UINT16X2_TO_UINT32(v), WaveGetLaneIndex() ^ x));
+}
+FfxUInt16x4 ffxWaveXorUint16x4(FfxUInt16x4 v, FfxUInt32 x)
+{
+    return FFX_UINT32X2_TO_UINT16X4(WaveReadLaneAt(FFX_UINT16X4_TO_UINT32X2(v), WaveGetLaneIndex() ^ x));
+}
+#endif // FFX_HALF
+#endif // #if defined(FFX_WAVE)
diff --git a/thirdparty/amd-fsr2/shaders/ffx_core_portability.h b/thirdparty/amd-fsr2/shaders/ffx_core_portability.h
new file mode 100644
index 00000000000..45be05973a8
--- /dev/null
+++ b/thirdparty/amd-fsr2/shaders/ffx_core_portability.h
@@ -0,0 +1,50 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+ +FfxFloat32x3 opAAddOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b) +{ + d = a + ffxBroadcast3(b); + return d; +} + +FfxFloat32x3 opACpyF3(FfxFloat32x3 d, FfxFloat32x3 a) +{ + d = a; + return d; +} + +FfxFloat32x3 opAMulF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b) +{ + d = a * b; + return d; +} + +FfxFloat32x3 opAMulOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b) +{ + d = a * ffxBroadcast3(b); + return d; +} + +FfxFloat32x3 opARcpF3(FfxFloat32x3 d, FfxFloat32x3 a) +{ + d = rcp(a); + return d; +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr1.h b/thirdparty/amd-fsr2/shaders/ffx_fsr1.h new file mode 100644 index 00000000000..1ac23cf3de3 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr1.h @@ -0,0 +1,1250 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-variable" +#endif + +/// Setup required constant values for EASU (works on CPU or GPU). +/// +/// @param [out] con0 +/// @param [out] con1 +/// @param [out] con2 +/// @param [out] con3 +/// @param [in] inputViewportInPixelsX The rendered image resolution being upscaled in X dimension. +/// @param [in] inputViewportInPixelsY The rendered image resolution being upscaled in Y dimension. +/// @param [in] inputSizeInPixelsX The resolution of the resource containing the input image (useful for dynamic resolution) in X dimension. +/// @param [in] inputSizeInPixelsY The resolution of the resource containing the input image (useful for dynamic resolution) in Y dimension. +/// @param [in] outputSizeInPixelsX The display resolution which the input image gets upscaled to in X dimension. +/// @param [in] outputSizeInPixelsY The display resolution which the input image gets upscaled to in Y dimension. +/// +/// @ingroup FSR1 +FFX_STATIC void ffxFsrPopulateEasuConstants( + FFX_PARAMETER_INOUT FfxUInt32x4 con0, + FFX_PARAMETER_INOUT FfxUInt32x4 con1, + FFX_PARAMETER_INOUT FfxUInt32x4 con2, + FFX_PARAMETER_INOUT FfxUInt32x4 con3, + FFX_PARAMETER_IN FfxFloat32 inputViewportInPixelsX, + FFX_PARAMETER_IN FfxFloat32 inputViewportInPixelsY, + FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsX, + FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsY, + FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsX, + FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsY) +{ + // Output integer position to a pixel position in viewport. 
+ con0[0] = ffxAsUInt32(inputViewportInPixelsX * ffxReciprocal(outputSizeInPixelsX)); + con0[1] = ffxAsUInt32(inputViewportInPixelsY * ffxReciprocal(outputSizeInPixelsY)); + con0[2] = ffxAsUInt32(FfxFloat32(0.5) * inputViewportInPixelsX * ffxReciprocal(outputSizeInPixelsX) - FfxFloat32(0.5)); + con0[3] = ffxAsUInt32(FfxFloat32(0.5) * inputViewportInPixelsY * ffxReciprocal(outputSizeInPixelsY) - FfxFloat32(0.5)); + + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0] = ffxAsUInt32(ffxReciprocal(inputSizeInPixelsX)); + con1[1] = ffxAsUInt32(ffxReciprocal(inputSizeInPixelsY)); + + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2] = ffxAsUInt32(FfxFloat32(1.0) * ffxReciprocal(inputSizeInPixelsX)); + con1[3] = ffxAsUInt32(FfxFloat32(-1.0) * ffxReciprocal(inputSizeInPixelsY)); + + // These are from (0) instead of 'F'. + con2[0] = ffxAsUInt32(FfxFloat32(-1.0) * ffxReciprocal(inputSizeInPixelsX)); + con2[1] = ffxAsUInt32(FfxFloat32(2.0) * ffxReciprocal(inputSizeInPixelsY)); + con2[2] = ffxAsUInt32(FfxFloat32(1.0) * ffxReciprocal(inputSizeInPixelsX)); + con2[3] = ffxAsUInt32(FfxFloat32(2.0) * ffxReciprocal(inputSizeInPixelsY)); + con3[0] = ffxAsUInt32(FfxFloat32(0.0) * ffxReciprocal(inputSizeInPixelsX)); + con3[1] = ffxAsUInt32(FfxFloat32(4.0) * ffxReciprocal(inputSizeInPixelsY)); + con3[2] = con3[3] = 0; +} + +/// Setup required constant values for EASU (works on CPU or GPU). +/// +/// @param [out] con0 +/// @param [out] con1 +/// @param [out] con2 +/// @param [out] con3 +/// @param [in] inputViewportInPixelsX The resolution of the input in the X dimension. +/// @param [in] inputViewportInPixelsY The resolution of the input in the Y dimension. +/// @param [in] inputSizeInPixelsX The input size in pixels in the X dimension. +/// @param [in] inputSizeInPixelsY The input size in pixels in the Y dimension. +/// @param [in] outputSizeInPixelsX The output size in pixels in the X dimension. +/// @param [in] outputSizeInPixelsY The output size in pixels in the Y dimension. +/// @param [in] inputOffsetInPixelsX The input image offset in the X dimension into the resource containing it (useful for dynamic resolution). +/// @param [in] inputOffsetInPixelsY The input image offset in the Y dimension into the resource containing it (useful for dynamic resolution). 
+/// +/// @ingroup FSR1 +FFX_STATIC void ffxFsrPopulateEasuConstantsOffset( + FFX_PARAMETER_INOUT FfxUInt32x4 con0, + FFX_PARAMETER_INOUT FfxUInt32x4 con1, + FFX_PARAMETER_INOUT FfxUInt32x4 con2, + FFX_PARAMETER_INOUT FfxUInt32x4 con3, + FFX_PARAMETER_IN FfxFloat32 inputViewportInPixelsX, + FFX_PARAMETER_IN FfxFloat32 inputViewportInPixelsY, + FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsX, + FFX_PARAMETER_IN FfxFloat32 inputSizeInPixelsY, + FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsX, + FFX_PARAMETER_IN FfxFloat32 outputSizeInPixelsY, + FFX_PARAMETER_IN FfxFloat32 inputOffsetInPixelsX, + FFX_PARAMETER_IN FfxFloat32 inputOffsetInPixelsY) +{ + ffxFsrPopulateEasuConstants( + con0, + con1, + con2, + con3, + inputViewportInPixelsX, + inputViewportInPixelsY, + inputSizeInPixelsX, + inputSizeInPixelsY, + outputSizeInPixelsX, + outputSizeInPixelsY); + + // override + con0[2] = ffxAsUInt32(FfxFloat32(0.5) * inputViewportInPixelsX * ffxReciprocal(outputSizeInPixelsX) - FfxFloat32(0.5) + inputOffsetInPixelsX); + con0[3] = ffxAsUInt32(FfxFloat32(0.5) * inputViewportInPixelsY * ffxReciprocal(outputSizeInPixelsY) - FfxFloat32(0.5) + inputOffsetInPixelsY); +} + +#if defined(FFX_GPU) && defined(FFX_FSR_EASU_FLOAT) +// Input callback prototypes, need to be implemented by calling shader +FfxFloat32x4 FsrEasuRF(FfxFloat32x2 p); +FfxFloat32x4 FsrEasuGF(FfxFloat32x2 p); +FfxFloat32x4 FsrEasuBF(FfxFloat32x2 p); + +// Filtering for a given tap for the scalar. +void fsrEasuTapFloat( + FFX_PARAMETER_INOUT FfxFloat32x3 accumulatedColor, // Accumulated color, with negative lobe. + FFX_PARAMETER_INOUT FfxFloat32 accumulatedWeight, // Accumulated weight. + FFX_PARAMETER_IN FfxFloat32x2 pixelOffset, // Pixel offset from resolve position to tap. + FFX_PARAMETER_IN FfxFloat32x2 gradientDirection, // Gradient direction. + FFX_PARAMETER_IN FfxFloat32x2 length, // Length. + FFX_PARAMETER_IN FfxFloat32 negativeLobeStrength, // Negative lobe strength. + FFX_PARAMETER_IN FfxFloat32 clippingPoint, // Clipping point. + FFX_PARAMETER_IN FfxFloat32x3 color) // Tap color. +{ + // Rotate offset by direction. + FfxFloat32x2 rotatedOffset; + rotatedOffset.x = (pixelOffset.x * (gradientDirection.x)) + (pixelOffset.y * gradientDirection.y); + rotatedOffset.y = (pixelOffset.x * (-gradientDirection.y)) + (pixelOffset.y * gradientDirection.x); + + // Anisotropy. + rotatedOffset *= length; + + // Compute distance^2. + FfxFloat32 distanceSquared = rotatedOffset.x * rotatedOffset.x + rotatedOffset.y * rotatedOffset.y; + + // Limit to the window as at corner, 2 taps can easily be outside. + distanceSquared = ffxMin(distanceSquared, clippingPoint); + + // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + FfxFloat32 weightB = FfxFloat32(2.0 / 5.0) * distanceSquared + FfxFloat32(-1.0); + FfxFloat32 weightA = negativeLobeStrength * distanceSquared + FfxFloat32(-1.0); + weightB *= weightB; + weightA *= weightA; + weightB = FfxFloat32(25.0 / 16.0) * weightB + FfxFloat32(-(25.0 / 16.0 - 1.0)); + FfxFloat32 weight = weightB * weightA; + + // Do weighted average. + accumulatedColor += color * weight; + accumulatedWeight += weight; +} + +// Accumulate direction and length. 
+void fsrEasuSetFloat( + FFX_PARAMETER_INOUT FfxFloat32x2 direction, + FFX_PARAMETER_INOUT FfxFloat32 length, + FFX_PARAMETER_IN FfxFloat32x2 pp, + FFX_PARAMETER_IN FfxBoolean biS, + FFX_PARAMETER_IN FfxBoolean biT, + FFX_PARAMETER_IN FfxBoolean biU, + FFX_PARAMETER_IN FfxBoolean biV, + FFX_PARAMETER_IN FfxFloat32 lA, + FFX_PARAMETER_IN FfxFloat32 lB, + FFX_PARAMETER_IN FfxFloat32 lC, + FFX_PARAMETER_IN FfxFloat32 lD, + FFX_PARAMETER_IN FfxFloat32 lE) +{ + // Compute bilinear weight, branches factor out as predicates are compiler time immediates. + // s t + // u v + FfxFloat32 weight = FfxFloat32(0.0); + if (biS) + weight = (FfxFloat32(1.0) - pp.x) * (FfxFloat32(1.0) - pp.y); + if (biT) + weight = pp.x * (FfxFloat32(1.0) - pp.y); + if (biU) + weight = (FfxFloat32(1.0) - pp.x) * pp.y; + if (biV) + weight = pp.x * pp.y; + + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + FfxFloat32 dc = lD - lC; + FfxFloat32 cb = lC - lB; + FfxFloat32 lengthX = max(abs(dc), abs(cb)); + lengthX = ffxApproximateReciprocal(lengthX); + FfxFloat32 directionX = lD - lB; + direction.x += directionX * weight; + lengthX = ffxSaturate(abs(directionX) * lengthX); + lengthX *= lengthX; + length += lengthX * weight; + + // Repeat for the y axis. + FfxFloat32 ec = lE - lC; + FfxFloat32 ca = lC - lA; + FfxFloat32 lengthY = max(abs(ec), abs(ca)); + lengthY = ffxApproximateReciprocal(lengthY); + FfxFloat32 directionY = lE - lA; + direction.y += directionY * weight; + lengthY = ffxSaturate(abs(directionY) * lengthY); + lengthY *= lengthY; + length += lengthY * weight; +} + +/// Apply edge-aware spatial upsampling using 32bit floating point precision calculations. +/// +/// @param [out] outPixel The computed color of a pixel. +/// @param [in] integerPosition Integer pixel position within the output. +/// @param [in] con0 The first constant value generated by ffxFsrPopulateEasuConstants. +/// @param [in] con1 The second constant value generated by ffxFsrPopulateEasuConstants. +/// @param [in] con2 The third constant value generated by ffxFsrPopulateEasuConstants. +/// @param [in] con3 The fourth constant value generated by ffxFsrPopulateEasuConstants. +/// +/// @ingroup FSR +void ffxFsrEasuFloat( + FFX_PARAMETER_OUT FfxFloat32x3 pix, + FFX_PARAMETER_IN FfxUInt32x2 ip, + FFX_PARAMETER_IN FfxUInt32x4 con0, + FFX_PARAMETER_IN FfxUInt32x4 con1, + FFX_PARAMETER_IN FfxUInt32x4 con2, + FFX_PARAMETER_IN FfxUInt32x4 con3) +{ + // Get position of 'f'. + FfxFloat32x2 pp = FfxFloat32x2(ip) * ffxAsFloat(con0.xy) + ffxAsFloat(con0.zw); + FfxFloat32x2 fp = floor(pp); + pp -= fp; + + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + FfxFloat32x2 p0 = fp * ffxAsFloat(con1.xy) + ffxAsFloat(con1.zw); + + // These are from p0 to avoid pulling two constants on pre-Navi hardware. 
+ FfxFloat32x2 p1 = p0 + ffxAsFloat(con2.xy); + FfxFloat32x2 p2 = p0 + ffxAsFloat(con2.zw); + FfxFloat32x2 p3 = p0 + ffxAsFloat(con3.xy); + FfxFloat32x4 bczzR = FsrEasuRF(p0); + FfxFloat32x4 bczzG = FsrEasuGF(p0); + FfxFloat32x4 bczzB = FsrEasuBF(p0); + FfxFloat32x4 ijfeR = FsrEasuRF(p1); + FfxFloat32x4 ijfeG = FsrEasuGF(p1); + FfxFloat32x4 ijfeB = FsrEasuBF(p1); + FfxFloat32x4 klhgR = FsrEasuRF(p2); + FfxFloat32x4 klhgG = FsrEasuGF(p2); + FfxFloat32x4 klhgB = FsrEasuBF(p2); + FfxFloat32x4 zzonR = FsrEasuRF(p3); + FfxFloat32x4 zzonG = FsrEasuGF(p3); + FfxFloat32x4 zzonB = FsrEasuBF(p3); + + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). + FfxFloat32x4 bczzL = bczzB * ffxBroadcast4(0.5) + (bczzR * ffxBroadcast4(0.5) + bczzG); + FfxFloat32x4 ijfeL = ijfeB * ffxBroadcast4(0.5) + (ijfeR * ffxBroadcast4(0.5) + ijfeG); + FfxFloat32x4 klhgL = klhgB * ffxBroadcast4(0.5) + (klhgR * ffxBroadcast4(0.5) + klhgG); + FfxFloat32x4 zzonL = zzonB * ffxBroadcast4(0.5) + (zzonR * ffxBroadcast4(0.5) + zzonG); + + // Rename. + FfxFloat32 bL = bczzL.x; + FfxFloat32 cL = bczzL.y; + FfxFloat32 iL = ijfeL.x; + FfxFloat32 jL = ijfeL.y; + FfxFloat32 fL = ijfeL.z; + FfxFloat32 eL = ijfeL.w; + FfxFloat32 kL = klhgL.x; + FfxFloat32 lL = klhgL.y; + FfxFloat32 hL = klhgL.z; + FfxFloat32 gL = klhgL.w; + FfxFloat32 oL = zzonL.z; + FfxFloat32 nL = zzonL.w; + + // Accumulate for bilinear interpolation. + FfxFloat32x2 dir = ffxBroadcast2(0.0); + FfxFloat32 len = FfxFloat32(0.0); + fsrEasuSetFloat(dir, len, pp, FFX_TRUE, FFX_FALSE, FFX_FALSE, FFX_FALSE, bL, eL, fL, gL, jL); + fsrEasuSetFloat(dir, len, pp, FFX_FALSE, FFX_TRUE, FFX_FALSE, FFX_FALSE, cL, fL, gL, hL, kL); + fsrEasuSetFloat(dir, len, pp, FFX_FALSE, FFX_FALSE, FFX_TRUE, FFX_FALSE, fL, iL, jL, kL, nL); + fsrEasuSetFloat(dir, len, pp, FFX_FALSE, FFX_FALSE, FFX_FALSE, FFX_TRUE, gL, jL, kL, lL, oL); + + // Normalize with approximation, and cleanup close to zero. + FfxFloat32x2 dir2 = dir * dir; + FfxFloat32 dirR = dir2.x + dir2.y; + FfxUInt32 zro = dirR < FfxFloat32(1.0 / 32768.0); + dirR = ffxApproximateReciprocalSquareRoot(dirR); + dirR = zro ? FfxFloat32(1.0) : dirR; + dir.x = zro ? FfxFloat32(1.0) : dir.x; + dir *= ffxBroadcast2(dirR); + + // Transform from {0 to 2} to {0 to 1} range, and shape with square. + len = len * FfxFloat32(0.5); + len *= len; + + // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. + FfxFloat32 stretch = (dir.x * dir.x + dir.y * dir.y) * ffxApproximateReciprocal(max(abs(dir.x), abs(dir.y))); + + // Anisotropic length after rotation, + // x := 1.0 lerp to 'stretch' on edges + // y := 1.0 lerp to 2x on edges + FfxFloat32x2 len2 = FfxFloat32x2(FfxFloat32(1.0) + (stretch - FfxFloat32(1.0)) * len, FfxFloat32(1.0) + FfxFloat32(-0.5) * len); + + // Based on the amount of 'edge', + // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. + FfxFloat32 lob = FfxFloat32(0.5) + FfxFloat32((1.0 / 4.0 - 0.04) - 0.5) * len; + + // Set distance^2 clipping point to the end of the adjustable window. + FfxFloat32 clp = ffxApproximateReciprocal(lob); + + // Accumulation mixed with min/max of 4 nearest. 
+ // b c + // e f g h + // i j k l + // n o + FfxFloat32x3 min4 = + ffxMin(ffxMin3(FfxFloat32x3(ijfeR.z, ijfeG.z, ijfeB.z), FfxFloat32x3(klhgR.w, klhgG.w, klhgB.w), FfxFloat32x3(ijfeR.y, ijfeG.y, ijfeB.y)), + FfxFloat32x3(klhgR.x, klhgG.x, klhgB.x)); + FfxFloat32x3 max4 = + max(ffxMax3(FfxFloat32x3(ijfeR.z, ijfeG.z, ijfeB.z), FfxFloat32x3(klhgR.w, klhgG.w, klhgB.w), FfxFloat32x3(ijfeR.y, ijfeG.y, ijfeB.y)), FfxFloat32x3(klhgR.x, klhgG.x, klhgB.x)); + + // Accumulation. + FfxFloat32x3 aC = ffxBroadcast3(0.0); + FfxFloat32 aW = FfxFloat32(0.0); + fsrEasuTapFloat(aC, aW, FfxFloat32x2(0.0, -1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(bczzR.x, bczzG.x, bczzB.x)); // b + fsrEasuTapFloat(aC, aW, FfxFloat32x2(1.0, -1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(bczzR.y, bczzG.y, bczzB.y)); // c + fsrEasuTapFloat(aC, aW, FfxFloat32x2(-1.0, 1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(ijfeR.x, ijfeG.x, ijfeB.x)); // i + fsrEasuTapFloat(aC, aW, FfxFloat32x2(0.0, 1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(ijfeR.y, ijfeG.y, ijfeB.y)); // j + fsrEasuTapFloat(aC, aW, FfxFloat32x2(0.0, 0.0) - pp, dir, len2, lob, clp, FfxFloat32x3(ijfeR.z, ijfeG.z, ijfeB.z)); // f + fsrEasuTapFloat(aC, aW, FfxFloat32x2(-1.0, 0.0) - pp, dir, len2, lob, clp, FfxFloat32x3(ijfeR.w, ijfeG.w, ijfeB.w)); // e + fsrEasuTapFloat(aC, aW, FfxFloat32x2(1.0, 1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(klhgR.x, klhgG.x, klhgB.x)); // k + fsrEasuTapFloat(aC, aW, FfxFloat32x2(2.0, 1.0) - pp, dir, len2, lob, clp, FfxFloat32x3(klhgR.y, klhgG.y, klhgB.y)); // l + fsrEasuTapFloat(aC, aW, FfxFloat32x2(2.0, 0.0) - pp, dir, len2, lob, clp, FfxFloat32x3(klhgR.z, klhgG.z, klhgB.z)); // h + fsrEasuTapFloat(aC, aW, FfxFloat32x2(1.0, 0.0) - pp, dir, len2, lob, clp, FfxFloat32x3(klhgR.w, klhgG.w, klhgB.w)); // g + fsrEasuTapFloat(aC, aW, FfxFloat32x2(1.0, 2.0) - pp, dir, len2, lob, clp, FfxFloat32x3(zzonR.z, zzonG.z, zzonB.z)); // o + fsrEasuTapFloat(aC, aW, FfxFloat32x2(0.0, 2.0) - pp, dir, len2, lob, clp, FfxFloat32x3(zzonR.w, zzonG.w, zzonB.w)); // n + + // Normalize and dering. + pix = ffxMin(max4, max(min4, aC * ffxBroadcast3(rcp(aW)))); +} +#endif // #if defined(FFX_GPU) && defined(FFX_FSR_EASU_FLOAT) + +#if defined(FFX_GPU) && FFX_HALF == 1 && defined(FFX_FSR_EASU_HALF) +// Input callback prototypes, need to be implemented by calling shader +FfxFloat16x4 FsrEasuRH(FfxFloat32x2 p); +FfxFloat16x4 FsrEasuGH(FfxFloat32x2 p); +FfxFloat16x4 FsrEasuBH(FfxFloat32x2 p); + +// This runs 2 taps in parallel. 
+void FsrEasuTapH( + FFX_PARAMETER_INOUT FfxFloat16x2 aCR, + FFX_PARAMETER_INOUT FfxFloat16x2 aCG, + FFX_PARAMETER_INOUT FfxFloat16x2 aCB, + FFX_PARAMETER_INOUT FfxFloat16x2 aW, + FFX_PARAMETER_IN FfxFloat16x2 offX, + FFX_PARAMETER_IN FfxFloat16x2 offY, + FFX_PARAMETER_IN FfxFloat16x2 dir, + FFX_PARAMETER_IN FfxFloat16x2 len, + FFX_PARAMETER_IN FfxFloat16 lob, + FFX_PARAMETER_IN FfxFloat16 clp, + FFX_PARAMETER_IN FfxFloat16x2 cR, + FFX_PARAMETER_IN FfxFloat16x2 cG, + FFX_PARAMETER_IN FfxFloat16x2 cB) +{ + FfxFloat16x2 vX, vY; + vX = offX * dir.xx + offY * dir.yy; + vY = offX * (-dir.yy) + offY * dir.xx; + vX *= len.x; + vY *= len.y; + FfxFloat16x2 d2 = vX * vX + vY * vY; + d2 = min(d2, FFX_BROADCAST_FLOAT16X2(clp)); + FfxFloat16x2 wB = FFX_BROADCAST_FLOAT16X2(2.0 / 5.0) * d2 + FFX_BROADCAST_FLOAT16X2(-1.0); + FfxFloat16x2 wA = FFX_BROADCAST_FLOAT16X2(lob) * d2 + FFX_BROADCAST_FLOAT16X2(-1.0); + wB *= wB; + wA *= wA; + wB = FFX_BROADCAST_FLOAT16X2(25.0 / 16.0) * wB + FFX_BROADCAST_FLOAT16X2(-(25.0 / 16.0 - 1.0)); + FfxFloat16x2 w = wB * wA; + aCR += cR * w; + aCG += cG * w; + aCB += cB * w; + aW += w; +} + +// This runs 2 taps in parallel. +void FsrEasuSetH( + FFX_PARAMETER_INOUT FfxFloat16x2 dirPX, + FFX_PARAMETER_INOUT FfxFloat16x2 dirPY, + FFX_PARAMETER_INOUT FfxFloat16x2 lenP, + FFX_PARAMETER_IN FfxFloat16x2 pp, + FFX_PARAMETER_IN FfxBoolean biST, + FFX_PARAMETER_IN FfxBoolean biUV, + FFX_PARAMETER_IN FfxFloat16x2 lA, + FFX_PARAMETER_IN FfxFloat16x2 lB, + FFX_PARAMETER_IN FfxFloat16x2 lC, + FFX_PARAMETER_IN FfxFloat16x2 lD, + FFX_PARAMETER_IN FfxFloat16x2 lE) +{ + FfxFloat16x2 w = FFX_BROADCAST_FLOAT16X2(0.0); + + if (biST) + w = (FfxFloat16x2(1.0, 0.0) + FfxFloat16x2(-pp.x, pp.x)) * FFX_BROADCAST_FLOAT16X2(FFX_BROADCAST_FLOAT16(1.0) - pp.y); + + if (biUV) + w = (FfxFloat16x2(1.0, 0.0) + FfxFloat16x2(-pp.x, pp.x)) * FFX_BROADCAST_FLOAT16X2(pp.y); + + // ABS is not free in the packed FP16 path. 
+ FfxFloat16x2 dc = lD - lC; + FfxFloat16x2 cb = lC - lB; + FfxFloat16x2 lenX = max(abs(dc), abs(cb)); + lenX = ffxReciprocalHalf(lenX); + + FfxFloat16x2 dirX = lD - lB; + dirPX += dirX * w; + lenX = ffxSaturate(abs(dirX) * lenX); + lenX *= lenX; + lenP += lenX * w; + FfxFloat16x2 ec = lE - lC; + FfxFloat16x2 ca = lC - lA; + FfxFloat16x2 lenY = max(abs(ec), abs(ca)); + lenY = ffxReciprocalHalf(lenY); + FfxFloat16x2 dirY = lE - lA; + dirPY += dirY * w; + lenY = ffxSaturate(abs(dirY) * lenY); + lenY *= lenY; + lenP += lenY * w; +} + +void FsrEasuH( + FFX_PARAMETER_OUT FfxFloat16x3 pix, + FFX_PARAMETER_IN FfxUInt32x2 ip, + FFX_PARAMETER_IN FfxUInt32x4 con0, + FFX_PARAMETER_IN FfxUInt32x4 con1, + FFX_PARAMETER_IN FfxUInt32x4 con2, + FFX_PARAMETER_IN FfxUInt32x4 con3) +{ + FfxFloat32x2 pp = FfxFloat32x2(ip) * ffxAsFloat(con0.xy) + ffxAsFloat(con0.zw); + FfxFloat32x2 fp = floor(pp); + pp -= fp; + FfxFloat16x2 ppp = FfxFloat16x2(pp); + + FfxFloat32x2 p0 = fp * ffxAsFloat(con1.xy) + ffxAsFloat(con1.zw); + FfxFloat32x2 p1 = p0 + ffxAsFloat(con2.xy); + FfxFloat32x2 p2 = p0 + ffxAsFloat(con2.zw); + FfxFloat32x2 p3 = p0 + ffxAsFloat(con3.xy); + FfxFloat16x4 bczzR = FsrEasuRH(p0); + FfxFloat16x4 bczzG = FsrEasuGH(p0); + FfxFloat16x4 bczzB = FsrEasuBH(p0); + FfxFloat16x4 ijfeR = FsrEasuRH(p1); + FfxFloat16x4 ijfeG = FsrEasuGH(p1); + FfxFloat16x4 ijfeB = FsrEasuBH(p1); + FfxFloat16x4 klhgR = FsrEasuRH(p2); + FfxFloat16x4 klhgG = FsrEasuGH(p2); + FfxFloat16x4 klhgB = FsrEasuBH(p2); + FfxFloat16x4 zzonR = FsrEasuRH(p3); + FfxFloat16x4 zzonG = FsrEasuGH(p3); + FfxFloat16x4 zzonB = FsrEasuBH(p3); + + FfxFloat16x4 bczzL = bczzB * FFX_BROADCAST_FLOAT16X4(0.5) + (bczzR * FFX_BROADCAST_FLOAT16X4(0.5) + bczzG); + FfxFloat16x4 ijfeL = ijfeB * FFX_BROADCAST_FLOAT16X4(0.5) + (ijfeR * FFX_BROADCAST_FLOAT16X4(0.5) + ijfeG); + FfxFloat16x4 klhgL = klhgB * FFX_BROADCAST_FLOAT16X4(0.5) + (klhgR * FFX_BROADCAST_FLOAT16X4(0.5) + klhgG); + FfxFloat16x4 zzonL = zzonB * FFX_BROADCAST_FLOAT16X4(0.5) + (zzonR * FFX_BROADCAST_FLOAT16X4(0.5) + zzonG); + FfxFloat16 bL = bczzL.x; + FfxFloat16 cL = bczzL.y; + FfxFloat16 iL = ijfeL.x; + FfxFloat16 jL = ijfeL.y; + FfxFloat16 fL = ijfeL.z; + FfxFloat16 eL = ijfeL.w; + FfxFloat16 kL = klhgL.x; + FfxFloat16 lL = klhgL.y; + FfxFloat16 hL = klhgL.z; + FfxFloat16 gL = klhgL.w; + FfxFloat16 oL = zzonL.z; + FfxFloat16 nL = zzonL.w; + + // This part is different, accumulating 2 taps in parallel. + FfxFloat16x2 dirPX = FFX_BROADCAST_FLOAT16X2(0.0); + FfxFloat16x2 dirPY = FFX_BROADCAST_FLOAT16X2(0.0); + FfxFloat16x2 lenP = FFX_BROADCAST_FLOAT16X2(0.0); + FsrEasuSetH(dirPX, + dirPY, + lenP, + ppp, + FfxUInt32(true), + FfxUInt32(false), + FfxFloat16x2(bL, cL), + FfxFloat16x2(eL, fL), + FfxFloat16x2(fL, gL), + FfxFloat16x2(gL, hL), + FfxFloat16x2(jL, kL)); + FsrEasuSetH(dirPX, + dirPY, + lenP, + ppp, + FfxUInt32(false), + FfxUInt32(true), + FfxFloat16x2(fL, gL), + FfxFloat16x2(iL, jL), + FfxFloat16x2(jL, kL), + FfxFloat16x2(kL, lL), + FfxFloat16x2(nL, oL)); + FfxFloat16x2 dir = FfxFloat16x2(dirPX.r + dirPX.g, dirPY.r + dirPY.g); + FfxFloat16 len = lenP.r + lenP.g; + + FfxFloat16x2 dir2 = dir * dir; + FfxFloat16 dirR = dir2.x + dir2.y; + FfxBoolean zro = FfxBoolean(dirR < FFX_BROADCAST_FLOAT16(1.0 / 32768.0)); + dirR = ffxApproximateReciprocalSquareRootHalf(dirR); + dirR = (zro > 0) ? FFX_BROADCAST_FLOAT16(1.0) : dirR; + dir.x = (zro > 0) ? 
FFX_BROADCAST_FLOAT16(1.0) : dir.x; + dir *= FFX_BROADCAST_FLOAT16X2(dirR); + len = len * FFX_BROADCAST_FLOAT16(0.5); + len *= len; + FfxFloat16 stretch = (dir.x * dir.x + dir.y * dir.y) * ffxApproximateReciprocalHalf(max(abs(dir.x), abs(dir.y))); + FfxFloat16x2 len2 = + FfxFloat16x2(FFX_BROADCAST_FLOAT16(1.0) + (stretch - FFX_BROADCAST_FLOAT16(1.0)) * len, FFX_BROADCAST_FLOAT16(1.0) + FFX_BROADCAST_FLOAT16(-0.5) * len); + FfxFloat16 lob = FFX_BROADCAST_FLOAT16(0.5) + FFX_BROADCAST_FLOAT16((1.0 / 4.0 - 0.04) - 0.5) * len; + FfxFloat16 clp = ffxApproximateReciprocalHalf(lob); + + // FP16 is different, using packed trick to do min and max in same operation. + FfxFloat16x2 bothR = + max(max(FfxFloat16x2(-ijfeR.z, ijfeR.z), FfxFloat16x2(-klhgR.w, klhgR.w)), max(FfxFloat16x2(-ijfeR.y, ijfeR.y), FfxFloat16x2(-klhgR.x, klhgR.x))); + FfxFloat16x2 bothG = + max(max(FfxFloat16x2(-ijfeG.z, ijfeG.z), FfxFloat16x2(-klhgG.w, klhgG.w)), max(FfxFloat16x2(-ijfeG.y, ijfeG.y), FfxFloat16x2(-klhgG.x, klhgG.x))); + FfxFloat16x2 bothB = + max(max(FfxFloat16x2(-ijfeB.z, ijfeB.z), FfxFloat16x2(-klhgB.w, klhgB.w)), max(FfxFloat16x2(-ijfeB.y, ijfeB.y), FfxFloat16x2(-klhgB.x, klhgB.x))); + + // This part is different for FP16, working pairs of taps at a time. + FfxFloat16x2 pR = FFX_BROADCAST_FLOAT16X2(0.0); + FfxFloat16x2 pG = FFX_BROADCAST_FLOAT16X2(0.0); + FfxFloat16x2 pB = FFX_BROADCAST_FLOAT16X2(0.0); + FfxFloat16x2 pW = FFX_BROADCAST_FLOAT16X2(0.0); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(0.0, 1.0) - ppp.xx, FfxFloat16x2(-1.0, -1.0) - ppp.yy, dir, len2, lob, clp, bczzR.xy, bczzG.xy, bczzB.xy); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(-1.0, 0.0) - ppp.xx, FfxFloat16x2(1.0, 1.0) - ppp.yy, dir, len2, lob, clp, ijfeR.xy, ijfeG.xy, ijfeB.xy); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(0.0, -1.0) - ppp.xx, FfxFloat16x2(0.0, 0.0) - ppp.yy, dir, len2, lob, clp, ijfeR.zw, ijfeG.zw, ijfeB.zw); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(1.0, 2.0) - ppp.xx, FfxFloat16x2(1.0, 1.0) - ppp.yy, dir, len2, lob, clp, klhgR.xy, klhgG.xy, klhgB.xy); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(2.0, 1.0) - ppp.xx, FfxFloat16x2(0.0, 0.0) - ppp.yy, dir, len2, lob, clp, klhgR.zw, klhgG.zw, klhgB.zw); + FsrEasuTapH(pR, pG, pB, pW, FfxFloat16x2(1.0, 0.0) - ppp.xx, FfxFloat16x2(2.0, 2.0) - ppp.yy, dir, len2, lob, clp, zzonR.zw, zzonG.zw, zzonB.zw); + FfxFloat16x3 aC = FfxFloat16x3(pR.x + pR.y, pG.x + pG.y, pB.x + pB.y); + FfxFloat16 aW = pW.x + pW.y; + + // Slightly different for FP16 version due to combined min and max. 
+ pix = min(FfxFloat16x3(bothR.y, bothG.y, bothB.y), max(-FfxFloat16x3(bothR.x, bothG.x, bothB.x), aC * FFX_BROADCAST_FLOAT16X3(ffxReciprocalHalf(aW)))); +} +#endif // #if defined(FFX_GPU) && defined(FFX_HALF) && defined(FFX_FSR_EASU_HALF) + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING +// +//------------------------------------------------------------------------------------------------------------------------------ +// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness. +// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping. +// RCAS also has a built in process to limit sharpening of what it detects as possible noise. +// The RCAS sharpener does not support scaling, as it should be applied after EASU scaling. +// Pass EASU output straight into RCAS, no color conversions necessary. +//------------------------------------------------------------------------------------------------------------------------------ +// RCAS is based on the following logic. +// RCAS uses a 5 tap filter in a cross pattern (same as CAS), +// w n +// w 1 w for taps w m e +// w s +// Where 'w' is the negative lobe weight. +// output = (w*(n+e+w+s)+m)/(4*w+1) +// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range, +// 0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks: +// +// FfxFloat16x4 FsrRcasLoadH(FfxInt16x2 p){return FfxFloat16x4(imageLoad(imgSrc,FfxInt32x2(p)));} +// void FsrRcasInputH(inout FfxFloat16 r,inout FfxFloat16 g,inout FfxFloat16 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon needs to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer.
+// +// FfxUInt32x4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. +// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). + FFX_STATIC void FsrRcasCon(FfxUInt32x4 con, + // The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. + FfxFloat32 sharpness) + { + // Transform from stops to linear value. + sharpness = exp2(-sharpness); + FfxFloat32x2 hSharp = {sharpness, sharpness}; + con[0] = ffxAsUInt32(sharpness); + con[1] = packHalf2x16(hSharp); + con[2] = 0; + con[3] = 0; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(FFX_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p); + void FsrRcasInputF(inout FfxFloat32 r,inout FfxFloat32 g,inout FfxFloat32 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF(out FfxFloat32 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out FfxFloat32 pixG, + out FfxFloat32 pixB, +#ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out FfxFloat32 pixA, +#endif + FfxUInt32x2 ip, // Integer pixel position in output. + FfxUInt32x4 con) + { // Constant generated by RcasSetup(). + // Algorithm uses minimal 3x3 pixel neighborhood. 
+ // b + // d e f + // h + FfxInt32x2 sp = FfxInt32x2(ip); + FfxFloat32x3 b = FsrRcasLoadF(sp + FfxInt32x2(0, -1)).rgb; + FfxFloat32x3 d = FsrRcasLoadF(sp + FfxInt32x2(-1, 0)).rgb; +#ifdef FSR_RCAS_PASSTHROUGH_ALPHA + FfxFloat32x4 ee = FsrRcasLoadF(sp); + FfxFloat32x3 e = ee.rgb; + pixA = ee.a; +#else + FfxFloat32x3 e = FsrRcasLoadF(sp).rgb; +#endif + FfxFloat32x3 f = FsrRcasLoadF(sp + FfxInt32x2(1, 0)).rgb; + FfxFloat32x3 h = FsrRcasLoadF(sp + FfxInt32x2(0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + FfxFloat32 bR = b.r; + FfxFloat32 bG = b.g; + FfxFloat32 bB = b.b; + FfxFloat32 dR = d.r; + FfxFloat32 dG = d.g; + FfxFloat32 dB = d.b; + FfxFloat32 eR = e.r; + FfxFloat32 eG = e.g; + FfxFloat32 eB = e.b; + FfxFloat32 fR = f.r; + FfxFloat32 fG = f.g; + FfxFloat32 fB = f.b; + FfxFloat32 hR = h.r; + FfxFloat32 hG = h.g; + FfxFloat32 hB = h.b; + // Run optional input transform. + FsrRcasInputF(bR, bG, bB); + FsrRcasInputF(dR, dG, dB); + FsrRcasInputF(eR, eG, eB); + FsrRcasInputF(fR, fG, fB); + FsrRcasInputF(hR, hG, hB); + // Luma times 2. + FfxFloat32 bL = bB * FfxFloat32(0.5) + (bR * FfxFloat32(0.5) + bG); + FfxFloat32 dL = dB * FfxFloat32(0.5) + (dR * FfxFloat32(0.5) + dG); + FfxFloat32 eL = eB * FfxFloat32(0.5) + (eR * FfxFloat32(0.5) + eG); + FfxFloat32 fL = fB * FfxFloat32(0.5) + (fR * FfxFloat32(0.5) + fG); + FfxFloat32 hL = hB * FfxFloat32(0.5) + (hR * FfxFloat32(0.5) + hG); + // Noise detection. + FfxFloat32 nz = FfxFloat32(0.25) * bL + FfxFloat32(0.25) * dL + FfxFloat32(0.25) * fL + FfxFloat32(0.25) * hL - eL; + nz = ffxSaturate(abs(nz) * ffxApproximateReciprocalMedium(ffxMax3(ffxMax3(bL, dL, eL), fL, hL) - ffxMin3(ffxMin3(bL, dL, eL), fL, hL))); + nz = FfxFloat32(-0.5) * nz + FfxFloat32(1.0); + // Min and max of ring. + FfxFloat32 mn4R = ffxMin(ffxMin3(bR, dR, fR), hR); + FfxFloat32 mn4G = ffxMin(ffxMin3(bG, dG, fG), hG); + FfxFloat32 mn4B = ffxMin(ffxMin3(bB, dB, fB), hB); + FfxFloat32 mx4R = max(ffxMax3(bR, dR, fR), hR); + FfxFloat32 mx4G = max(ffxMax3(bG, dG, fG), hG); + FfxFloat32 mx4B = max(ffxMax3(bB, dB, fB), hB); + // Immediate constants for peak range. + FfxFloat32x2 peakC = FfxFloat32x2(1.0, -1.0 * 4.0); + // Limiters, these need to be high precision RCPs. + FfxFloat32 hitMinR = mn4R * rcp(FfxFloat32(4.0) * mx4R); + FfxFloat32 hitMinG = mn4G * rcp(FfxFloat32(4.0) * mx4G); + FfxFloat32 hitMinB = mn4B * rcp(FfxFloat32(4.0) * mx4B); + FfxFloat32 hitMaxR = (peakC.x - mx4R) * rcp(FfxFloat32(4.0) * mn4R + peakC.y); + FfxFloat32 hitMaxG = (peakC.x - mx4G) * rcp(FfxFloat32(4.0) * mn4G + peakC.y); + FfxFloat32 hitMaxB = (peakC.x - mx4B) * rcp(FfxFloat32(4.0) * mn4B + peakC.y); + FfxFloat32 lobeR = max(-hitMinR, hitMaxR); + FfxFloat32 lobeG = max(-hitMinG, hitMaxG); + FfxFloat32 lobeB = max(-hitMinB, hitMaxB); + FfxFloat32 lobe = max(FfxFloat32(-FSR_RCAS_LIMIT), ffxMin(ffxMax3(lobeR, lobeG, lobeB), FfxFloat32(0.0))) * ffxAsFloat(con.x); + // Apply noise removal. +#ifdef FSR_RCAS_DENOISE + lobe *= nz; +#endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+ FfxFloat32 rcpL = ffxApproximateReciprocalMedium(FfxFloat32(4.0) * lobe + FfxFloat32(1.0)); + pixR = (lobe * bR + lobe * dR + lobe * hR + lobe * fR + eR) * rcpL; + pixG = (lobe * bG + lobe * dG + lobe * hG + lobe * fG + eG) * rcpL; + pixB = (lobe * bB + lobe * dB + lobe * hB + lobe * fB + eB) * rcpL; + return; + } +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(FFX_GPU) && FFX_HALF == 1 && defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + FfxFloat16x4 FsrRcasLoadH(FfxInt16x2 p); + void FsrRcasInputH(inout FfxFloat16 r,inout FfxFloat16 g,inout FfxFloat16 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out FfxFloat16 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out FfxFloat16 pixG, + out FfxFloat16 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out FfxFloat16 pixA, + #endif + FfxUInt32x2 ip, // Integer pixel position in output. + FfxUInt32x4 con){ // Constant generated by RcasSetup(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + FfxInt16x2 sp=FfxInt16x2(ip); + FfxFloat16x3 b=FsrRcasLoadH(sp+FfxInt16x2( 0,-1)).rgb; + FfxFloat16x3 d=FsrRcasLoadH(sp+FfxInt16x2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + FfxFloat16x4 ee=FsrRcasLoadH(sp); + FfxFloat16x3 e=ee.rgb;pixA=ee.a; + #else + FfxFloat16x3 e=FsrRcasLoadH(sp).rgb; + #endif + FfxFloat16x3 f=FsrRcasLoadH(sp+FfxInt16x2( 1, 0)).rgb; + FfxFloat16x3 h=FsrRcasLoadH(sp+FfxInt16x2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + FfxFloat16 bR=b.r; + FfxFloat16 bG=b.g; + FfxFloat16 bB=b.b; + FfxFloat16 dR=d.r; + FfxFloat16 dG=d.g; + FfxFloat16 dB=d.b; + FfxFloat16 eR=e.r; + FfxFloat16 eG=e.g; + FfxFloat16 eB=e.b; + FfxFloat16 fR=f.r; + FfxFloat16 fG=f.g; + FfxFloat16 fB=f.b; + FfxFloat16 hR=h.r; + FfxFloat16 hG=h.g; + FfxFloat16 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + FfxFloat16 bL=bB*FFX_BROADCAST_FLOAT16(0.5)+(bR*FFX_BROADCAST_FLOAT16(0.5)+bG); + FfxFloat16 dL=dB*FFX_BROADCAST_FLOAT16(0.5)+(dR*FFX_BROADCAST_FLOAT16(0.5)+dG); + FfxFloat16 eL=eB*FFX_BROADCAST_FLOAT16(0.5)+(eR*FFX_BROADCAST_FLOAT16(0.5)+eG); + FfxFloat16 fL=fB*FFX_BROADCAST_FLOAT16(0.5)+(fR*FFX_BROADCAST_FLOAT16(0.5)+fG); + FfxFloat16 hL=hB*FFX_BROADCAST_FLOAT16(0.5)+(hR*FFX_BROADCAST_FLOAT16(0.5)+hG); + // Noise detection. 
+ FfxFloat16 nz=FFX_BROADCAST_FLOAT16(0.25)*bL+FFX_BROADCAST_FLOAT16(0.25)*dL+FFX_BROADCAST_FLOAT16(0.25)*fL+FFX_BROADCAST_FLOAT16(0.25)*hL-eL; + nz=ffxSaturate(abs(nz)*ffxApproximateReciprocalMediumHalf(ffxMax3Half(ffxMax3Half(bL,dL,eL),fL,hL)-ffxMin3Half(ffxMin3Half(bL,dL,eL),fL,hL))); + nz=FFX_BROADCAST_FLOAT16(-0.5)*nz+FFX_BROADCAST_FLOAT16(1.0); + // Min and max of ring. + FfxFloat16 mn4R=min(ffxMin3Half(bR,dR,fR),hR); + FfxFloat16 mn4G=min(ffxMin3Half(bG,dG,fG),hG); + FfxFloat16 mn4B=min(ffxMin3Half(bB,dB,fB),hB); + FfxFloat16 mx4R=max(ffxMax3Half(bR,dR,fR),hR); + FfxFloat16 mx4G=max(ffxMax3Half(bG,dG,fG),hG); + FfxFloat16 mx4B=max(ffxMax3Half(bB,dB,fB),hB); + // Immediate constants for peak range. + FfxFloat16x2 peakC=FfxFloat16x2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + FfxFloat16 hitMinR=mn4R*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mx4R); + FfxFloat16 hitMinG=mn4G*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mx4G); + FfxFloat16 hitMinB=mn4B*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mx4B); + FfxFloat16 hitMaxR=(peakC.x-mx4R)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mn4R+peakC.y); + FfxFloat16 hitMaxG=(peakC.x-mx4G)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mn4G+peakC.y); + FfxFloat16 hitMaxB=(peakC.x-mx4B)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16(4.0)*mn4B+peakC.y); + FfxFloat16 lobeR=max(-hitMinR,hitMaxR); + FfxFloat16 lobeG=max(-hitMinG,hitMaxG); + FfxFloat16 lobeB=max(-hitMinB,hitMaxB); + FfxFloat16 lobe=max(FFX_BROADCAST_FLOAT16(-FSR_RCAS_LIMIT),min(ffxMax3Half(lobeR,lobeG,lobeB),FFX_BROADCAST_FLOAT16(0.0)))*FFX_UINT32_TO_FLOAT16X2(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + FfxFloat16 rcpL=ffxApproximateReciprocalMediumHalf(FFX_BROADCAST_FLOAT16(4.0)*lobe+FFX_BROADCAST_FLOAT16(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; +} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(FFX_GPU)&& FFX_HALF == 1 && defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + FfxFloat16x4 FsrRcasLoadHx2(FfxInt16x2 p); + void FsrRcasInputHx2(inout FfxFloat16x2 r,inout FfxFloat16x2 g,inout FfxFloat16x2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out FfxFloat16x4 pix0,out FfxFloat16x4 pix1,FfxFloat16x2 pixR,FfxFloat16x2 pixG,FfxFloat16x2 pixB){ + #ifdef FFX_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. 
+ pix0.a=pix1.a=0.0; + #endif + pix0.rgb=FfxFloat16x3(pixR.x,pixG.x,pixB.x); + pix1.rgb=FfxFloat16x3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix.x = left 8x8 tile + // pix.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out FfxFloat16x2 pixR, + out FfxFloat16x2 pixG, + out FfxFloat16x2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out FfxFloat16x2 pixA, + #endif + FfxUInt32x2 ip, // Integer pixel position in output. + FfxUInt32x4 con){ // Constant generated by RcasSetup(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + FfxInt16x2 sp0=FfxInt16x2(ip); + FfxFloat16x3 b0=FsrRcasLoadHx2(sp0+FfxInt16x2( 0,-1)).rgb; + FfxFloat16x3 d0=FsrRcasLoadHx2(sp0+FfxInt16x2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + FfxFloat16x4 ee0=FsrRcasLoadHx2(sp0); + FfxFloat16x3 e0=ee0.rgb;pixA.r=ee0.a; + #else + FfxFloat16x3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + FfxFloat16x3 f0=FsrRcasLoadHx2(sp0+FfxInt16x2( 1, 0)).rgb; + FfxFloat16x3 h0=FsrRcasLoadHx2(sp0+FfxInt16x2( 0, 1)).rgb; + FfxInt16x2 sp1=sp0+FfxInt16x2(8,0); + FfxFloat16x3 b1=FsrRcasLoadHx2(sp1+FfxInt16x2( 0,-1)).rgb; + FfxFloat16x3 d1=FsrRcasLoadHx2(sp1+FfxInt16x2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + FfxFloat16x4 ee1=FsrRcasLoadHx2(sp1); + FfxFloat16x3 e1=ee1.rgb;pixA.g=ee1.a; + #else + FfxFloat16x3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + FfxFloat16x3 f1=FsrRcasLoadHx2(sp1+FfxInt16x2( 1, 0)).rgb; + FfxFloat16x3 h1=FsrRcasLoadHx2(sp1+FfxInt16x2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. + FfxFloat16x2 bR=FfxFloat16x2(b0.r,b1.r); + FfxFloat16x2 bG=FfxFloat16x2(b0.g,b1.g); + FfxFloat16x2 bB=FfxFloat16x2(b0.b,b1.b); + FfxFloat16x2 dR=FfxFloat16x2(d0.r,d1.r); + FfxFloat16x2 dG=FfxFloat16x2(d0.g,d1.g); + FfxFloat16x2 dB=FfxFloat16x2(d0.b,d1.b); + FfxFloat16x2 eR=FfxFloat16x2(e0.r,e1.r); + FfxFloat16x2 eG=FfxFloat16x2(e0.g,e1.g); + FfxFloat16x2 eB=FfxFloat16x2(e0.b,e1.b); + FfxFloat16x2 fR=FfxFloat16x2(f0.r,f1.r); + FfxFloat16x2 fG=FfxFloat16x2(f0.g,f1.g); + FfxFloat16x2 fB=FfxFloat16x2(f0.b,f1.b); + FfxFloat16x2 hR=FfxFloat16x2(h0.r,h1.r); + FfxFloat16x2 hG=FfxFloat16x2(h0.g,h1.g); + FfxFloat16x2 hB=FfxFloat16x2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + FfxFloat16x2 bL=bB*FFX_BROADCAST_FLOAT16X2(0.5)+(bR*FFX_BROADCAST_FLOAT16X2(0.5)+bG); + FfxFloat16x2 dL=dB*FFX_BROADCAST_FLOAT16X2(0.5)+(dR*FFX_BROADCAST_FLOAT16X2(0.5)+dG); + FfxFloat16x2 eL=eB*FFX_BROADCAST_FLOAT16X2(0.5)+(eR*FFX_BROADCAST_FLOAT16X2(0.5)+eG); + FfxFloat16x2 fL=fB*FFX_BROADCAST_FLOAT16X2(0.5)+(fR*FFX_BROADCAST_FLOAT16X2(0.5)+fG); + FfxFloat16x2 hL=hB*FFX_BROADCAST_FLOAT16X2(0.5)+(hR*FFX_BROADCAST_FLOAT16X2(0.5)+hG); + // Noise detection. + FfxFloat16x2 nz=FFX_BROADCAST_FLOAT16X2(0.25)*bL+FFX_BROADCAST_FLOAT16X2(0.25)*dL+FFX_BROADCAST_FLOAT16X2(0.25)*fL+FFX_BROADCAST_FLOAT16X2(0.25)*hL-eL; + nz=ffxSaturate(abs(nz)*ffxApproximateReciprocalMediumHalf(ffxMax3Half(ffxMax3Half(bL,dL,eL),fL,hL)-ffxMin3Half(ffxMin3Half(bL,dL,eL),fL,hL))); + nz=FFX_BROADCAST_FLOAT16X2(-0.5)*nz+FFX_BROADCAST_FLOAT16X2(1.0); + // Min and max of ring. 
+ FfxFloat16x2 mn4R=min(ffxMin3Half(bR,dR,fR),hR); + FfxFloat16x2 mn4G=min(ffxMin3Half(bG,dG,fG),hG); + FfxFloat16x2 mn4B=min(ffxMin3Half(bB,dB,fB),hB); + FfxFloat16x2 mx4R=max(ffxMax3Half(bR,dR,fR),hR); + FfxFloat16x2 mx4G=max(ffxMax3Half(bG,dG,fG),hG); + FfxFloat16x2 mx4B=max(ffxMax3Half(bB,dB,fB),hB); + // Immediate constants for peak range. + FfxFloat16x2 peakC=FfxFloat16x2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + FfxFloat16x2 hitMinR=mn4R*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mx4R); + FfxFloat16x2 hitMinG=mn4G*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mx4G); + FfxFloat16x2 hitMinB=mn4B*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mx4B); + FfxFloat16x2 hitMaxR=(peakC.x-mx4R)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mn4R+peakC.y); + FfxFloat16x2 hitMaxG=(peakC.x-mx4G)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mn4G+peakC.y); + FfxFloat16x2 hitMaxB=(peakC.x-mx4B)*ffxReciprocalHalf(FFX_BROADCAST_FLOAT16X2(4.0)*mn4B+peakC.y); + FfxFloat16x2 lobeR=max(-hitMinR,hitMaxR); + FfxFloat16x2 lobeG=max(-hitMinG,hitMaxG); + FfxFloat16x2 lobeB=max(-hitMinB,hitMaxB); + FfxFloat16x2 lobe=max(FFX_BROADCAST_FLOAT16X2(-FSR_RCAS_LIMIT),min(ffxMax3Half(lobeR,lobeG,lobeB),FFX_BROADCAST_FLOAT16X2(0.0)))*FFX_BROADCAST_FLOAT16X2(FFX_UINT32_TO_FLOAT16X2(con.y).x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + FfxFloat16x2 rcpL=ffxApproximateReciprocalMediumHalf(FFX_BROADCAST_FLOAT16X2(4.0)*lobe+FFX_BROADCAST_FLOAT16X2(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). 
+//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,ffxBroadcast3(grain),amount)' +//============================================================================================================================== +#if defined(FFX_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout FfxFloat32x3 c, FfxFloat32x3 t, FfxFloat32 a) + { + c += (t * ffxBroadcast3(a)) * ffxMin(ffxBroadcast3(1.0) - c, c); + } +#endif +//============================================================================================================================== +#if defined(FFX_GPU)&& FFX_HALF == 1 + // Half precision version (slower). + void FsrLfgaH(inout FfxFloat16x3 c, FfxFloat16x3 t, FfxFloat16 a) + { + c += (t * FFX_BROADCAST_FLOAT16X3(a)) * min(FFX_BROADCAST_FLOAT16X3(1.0) - c, c); + } + //------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). + void FsrLfgaHx2(inout FfxFloat16x2 cR,inout FfxFloat16x2 cG,inout FfxFloat16x2 cB,FfxFloat16x2 tR,FfxFloat16x2 tG,FfxFloat16x2 tB,FfxFloat16 a){ + cR+=(tR*FFX_BROADCAST_FLOAT16X2(a))*min(FFX_BROADCAST_FLOAT16X2(1.0)-cR,cR);cG+=(tG*FFX_BROADCAST_FLOAT16X2(a))*min(FFX_BROADCAST_FLOAT16X2(1.0)-cG,cG);cB+=(tB*FFX_BROADCAST_FLOAT16X2(a))*min(FFX_BROADCAST_FLOAT16X2(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. 
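+// A minimal round-trip sketch of the intended call order (illustrative only; 'hdrColor' and
+// 'filterInZeroToOne' are hypothetical application-side values/passes, not part of this header):
+//
+//   FfxFloat32x3 c = hdrColor;    // linear HDR input, {0 to FP16_MAX} ranged (assumed to come from the app)
+//   FsrSrtmF(c);                  // -> {0 to 1}, RGB ratio preserving, safe to filter
+//   c = filterInZeroToOne(c);     // assumed filtering/upsampling performed in the {0 to 1} domain
+//   FsrSrtmInvF(c);               // -> {0 to 32768}, output peak safe for FP16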
+//============================================================================================================================== +#if defined(FFX_GPU) + void FsrSrtmF(inout FfxFloat32x3 c) + { + c *= ffxBroadcast3(rcp(ffxMax3(c.r, c.g, c.b) + FfxFloat32(1.0))); + } + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout FfxFloat32x3 c){c*=ffxBroadcast3(rcp(max(FfxFloat32(1.0/32768.0),FfxFloat32(1.0)-ffxMax3(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(FFX_GPU )&& FFX_HALF == 1 + void FsrSrtmH(inout FfxFloat16x3 c) + { + c *= FFX_BROADCAST_FLOAT16X3(ffxReciprocalHalf(ffxMax3Half(c.r, c.g, c.b) + FFX_BROADCAST_FLOAT16(1.0))); + } + void FsrSrtmInvH(inout FfxFloat16x3 c) + { + c *= FFX_BROADCAST_FLOAT16X3(ffxReciprocalHalf(max(FFX_BROADCAST_FLOAT16(1.0 / 32768.0), FFX_BROADCAST_FLOAT16(1.0) - ffxMax3Half(c.r, c.g, c.b)))); + } + //------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout FfxFloat16x2 cR, inout FfxFloat16x2 cG, inout FfxFloat16x2 cB) + { + FfxFloat16x2 rcp = ffxReciprocalHalf(ffxMax3Half(cR, cG, cB) + FFX_BROADCAST_FLOAT16X2(1.0)); + cR *= rcp; + cG *= rcp; + cB *= rcp; + } + void FsrSrtmInvHx2(inout FfxFloat16x2 cR,inout FfxFloat16x2 cG,inout FfxFloat16x2 cB) + { + FfxFloat16x2 rcp=ffxReciprocalHalf(max(FFX_BROADCAST_FLOAT16X2(1.0/32768.0),FFX_BROADCAST_FLOAT16X2(1.0)-ffxMax3Half(cR,cG,cB))); + cR*=rcp; + cG*=rcp; + cB*=rcp; + } +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'FfxUInt32(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. 
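+// A minimal usage sketch (illustrative only; 'pixelPos', 'frameIndex' and 'color' are hypothetical values
+// supplied by the calling shader, with 'color' being {0 to 1} linear and destined for an 8-bit UNORM store):
+//
+//   FfxFloat32 dit = FsrTepdDitF(pixelPos, frameIndex); // {0 to <1} dither value, varies per frame
+//   FsrTepdC8F(color, dit);                             // gamma 2.0 encode plus dither, ready for image store
+//   // For a 10:10:10:2 target, FsrTepdC10F(color, dit) would be used instead.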
+//============================================================================================================================== +#if defined(FFX_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this computation. + // Output is {0 to <1}. + FfxFloat32 FsrTepdDitF(FfxUInt32x2 p, FfxUInt32 f) + { + FfxFloat32 x = FfxFloat32(p.x + f); + FfxFloat32 y = FfxFloat32(p.y); + // The 1.61803 golden ratio. + FfxFloat32 a = FfxFloat32((1.0 + ffxSqrt(5.0f)) / 2.0); + // Number designed to provide a good visual pattern. + FfxFloat32 b = FfxFloat32(1.0 / 3.69); + x = x * a + (y * b); + return ffxFract(x); + } + //------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC8F(inout FfxFloat32x3 c, FfxFloat32 dit) + { + FfxFloat32x3 n = ffxSqrt(c); + n = floor(n * ffxBroadcast3(255.0)) * ffxBroadcast3(1.0 / 255.0); + FfxFloat32x3 a = n * n; + FfxFloat32x3 b = n + ffxBroadcast3(1.0 / 255.0); + b = b * b; + // Ratio of 'a' to 'b' required to produce 'c'. + // ffxApproximateReciprocal() won't work here (at least for very high dynamic ranges). + // ffxApproximateReciprocalMedium() is an IADD,FMA,MUL. + FfxFloat32x3 r = (c - b) * ffxApproximateReciprocalMedium(a - b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // ffxIsGreaterThanZero() is a MUL. + c = ffxSaturate(n + ffxIsGreaterThanZero(ffxBroadcast3(dit) - r) * ffxBroadcast3(1.0 / 255.0)); + } + //------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store.
+ void FsrTepdC10F(inout FfxFloat32x3 c, FfxFloat32 dit) + { + FfxFloat32x3 n = ffxSqrt(c); + n = floor(n * ffxBroadcast3(1023.0)) * ffxBroadcast3(1.0 / 1023.0); + FfxFloat32x3 a = n * n; + FfxFloat32x3 b = n + ffxBroadcast3(1.0 / 1023.0); + b = b * b; + FfxFloat32x3 r = (c - b) * ffxApproximateReciprocalMedium(a - b); + c = ffxSaturate(n + ffxIsGreaterThanZero(ffxBroadcast3(dit) - r) * ffxBroadcast3(1.0 / 1023.0)); + } +#endif +//============================================================================================================================== +#if defined(FFX_GPU)&& FFX_HALF == 1 + FfxFloat16 FsrTepdDitH(FfxUInt32x2 p, FfxUInt32 f) + { + FfxFloat32 x = FfxFloat32(p.x + f); + FfxFloat32 y = FfxFloat32(p.y); + FfxFloat32 a = FfxFloat32((1.0 + ffxSqrt(5.0f)) / 2.0); + FfxFloat32 b = FfxFloat32(1.0 / 3.69); + x = x * a + (y * b); + return FfxFloat16(ffxFract(x)); + } + //------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout FfxFloat16x3 c, FfxFloat16 dit) + { + FfxFloat16x3 n = sqrt(c); + n = floor(n * FFX_BROADCAST_FLOAT16X3(255.0)) * FFX_BROADCAST_FLOAT16X3(1.0 / 255.0); + FfxFloat16x3 a = n * n; + FfxFloat16x3 b = n + FFX_BROADCAST_FLOAT16X3(1.0 / 255.0); + b = b * b; + FfxFloat16x3 r = (c - b) * ffxApproximateReciprocalMediumHalf(a - b); + c = ffxSaturate(n + ffxIsGreaterThanZeroHalf(FFX_BROADCAST_FLOAT16X3(dit) - r) * FFX_BROADCAST_FLOAT16X3(1.0 / 255.0)); + } + //------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout FfxFloat16x3 c, FfxFloat16 dit) + { + FfxFloat16x3 n = sqrt(c); + n = floor(n * FFX_BROADCAST_FLOAT16X3(1023.0)) * FFX_BROADCAST_FLOAT16X3(1.0 / 1023.0); + FfxFloat16x3 a = n * n; + FfxFloat16x3 b = n + FFX_BROADCAST_FLOAT16X3(1.0 / 1023.0); + b = b * b; + FfxFloat16x3 r = (c - b) * ffxApproximateReciprocalMediumHalf(a - b); + c = ffxSaturate(n + ffxIsGreaterThanZeroHalf(FFX_BROADCAST_FLOAT16X3(dit) - r) * FFX_BROADCAST_FLOAT16X3(1.0 / 1023.0)); + } + //============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. 
+ FfxFloat16x2 FsrTepdDitHx2(FfxUInt32x2 p, FfxUInt32 f) + { + FfxFloat32x2 x; + x.x = FfxFloat32(p.x + f); + x.y = x.x + FfxFloat32(8.0); + FfxFloat32 y = FfxFloat32(p.y); + FfxFloat32 a = FfxFloat32((1.0 + ffxSqrt(5.0f)) / 2.0); + FfxFloat32 b = FfxFloat32(1.0 / 3.69); + x = x * ffxBroadcast2(a) + ffxBroadcast2(y * b); + return FfxFloat16x2(ffxFract(x)); + } + //------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout FfxFloat16x2 cR, inout FfxFloat16x2 cG, inout FfxFloat16x2 cB, FfxFloat16x2 dit) + { + FfxFloat16x2 nR = sqrt(cR); + FfxFloat16x2 nG = sqrt(cG); + FfxFloat16x2 nB = sqrt(cB); + nR = floor(nR * FFX_BROADCAST_FLOAT16X2(255.0)) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + nG = floor(nG * FFX_BROADCAST_FLOAT16X2(255.0)) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + nB = floor(nB * FFX_BROADCAST_FLOAT16X2(255.0)) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + FfxFloat16x2 aR = nR * nR; + FfxFloat16x2 aG = nG * nG; + FfxFloat16x2 aB = nB * nB; + FfxFloat16x2 bR = nR + FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + bR = bR * bR; + FfxFloat16x2 bG = nG + FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + bG = bG * bG; + FfxFloat16x2 bB = nB + FFX_BROADCAST_FLOAT16X2(1.0 / 255.0); + bB = bB * bB; + FfxFloat16x2 rR = (cR - bR) * ffxApproximateReciprocalMediumHalf(aR - bR); + FfxFloat16x2 rG = (cG - bG) * ffxApproximateReciprocalMediumHalf(aG - bG); + FfxFloat16x2 rB = (cB - bB) * ffxApproximateReciprocalMediumHalf(aB - bB); + cR = ffxSaturate(nR + ffxIsGreaterThanZeroHalf(dit - rR) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0)); + cG = ffxSaturate(nG + ffxIsGreaterThanZeroHalf(dit - rG) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0)); + cB = ffxSaturate(nB + ffxIsGreaterThanZeroHalf(dit - rB) * FFX_BROADCAST_FLOAT16X2(1.0 / 255.0)); + } + //------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout FfxFloat16x2 cR,inout FfxFloat16x2 cG,inout FfxFloat16x2 cB,FfxFloat16x2 dit){ + FfxFloat16x2 nR=sqrt(cR); + FfxFloat16x2 nG=sqrt(cG); + FfxFloat16x2 nB=sqrt(cB); + nR=floor(nR*FFX_BROADCAST_FLOAT16X2(1023.0))*FFX_BROADCAST_FLOAT16X2(1.0/1023.0); + nG=floor(nG*FFX_BROADCAST_FLOAT16X2(1023.0))*FFX_BROADCAST_FLOAT16X2(1.0/1023.0); + nB=floor(nB*FFX_BROADCAST_FLOAT16X2(1023.0))*FFX_BROADCAST_FLOAT16X2(1.0/1023.0); + FfxFloat16x2 aR=nR*nR; + FfxFloat16x2 aG=nG*nG; + FfxFloat16x2 aB=nB*nB; + FfxFloat16x2 bR=nR+FFX_BROADCAST_FLOAT16X2(1.0/1023.0);bR=bR*bR; + FfxFloat16x2 bG=nG+FFX_BROADCAST_FLOAT16X2(1.0/1023.0);bG=bG*bG; + FfxFloat16x2 bB=nB+FFX_BROADCAST_FLOAT16X2(1.0/1023.0);bB=bB*bB; + FfxFloat16x2 rR=(cR-bR)*ffxApproximateReciprocalMediumHalf(aR-bR); + FfxFloat16x2 rG=(cG-bG)*ffxApproximateReciprocalMediumHalf(aG-bG); + FfxFloat16x2 rB=(cB-bB)*ffxApproximateReciprocalMediumHalf(aB-bB); + cR=ffxSaturate(nR+ffxIsGreaterThanZeroHalf(dit-rR)*FFX_BROADCAST_FLOAT16X2(1.0/1023.0)); + cG=ffxSaturate(nG+ffxIsGreaterThanZeroHalf(dit-rG)*FFX_BROADCAST_FLOAT16X2(1.0/1023.0)); + cB = ffxSaturate(nB + ffxIsGreaterThanZeroHalf(dit - rB) * FFX_BROADCAST_FLOAT16X2(1.0 / 1023.0)); +} +#endif diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate.h new file mode 100644 index 00000000000..7bd5892cb90 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate.h @@ -0,0 +1,295 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. 
All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef FFX_FSR2_ACCUMULATE_H +#define FFX_FSR2_ACCUMULATE_H + +FfxFloat32 GetPxHrVelocity(FfxFloat32x2 fMotionVector) +{ + return length(fMotionVector * DisplaySize()); +} +#if FFX_HALF +FFX_MIN16_F GetPxHrVelocity(FFX_MIN16_F2 fMotionVector) +{ + return length(fMotionVector * FFX_MIN16_F2(DisplaySize())); +} +#endif + +void Accumulate(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT FfxFloat32x3 fHistoryColor, FfxFloat32x3 fAccumulation, FFX_PARAMETER_IN FfxFloat32x4 fUpsampledColorAndWeight) +{ + // Aviod invalid values when accumulation and upsampled weight is 0 + fAccumulation = ffxMax(FSR2_EPSILON.xxx, fAccumulation + fUpsampledColorAndWeight.www); + +#if FFX_FSR2_OPTION_HDR_COLOR_INPUT + //YCoCg -> RGB -> Tonemap -> YCoCg (Use RGB tonemapper to avoid color desaturation) + fUpsampledColorAndWeight.xyz = RGBToYCoCg(Tonemap(YCoCgToRGB(fUpsampledColorAndWeight.xyz))); + fHistoryColor = RGBToYCoCg(Tonemap(YCoCgToRGB(fHistoryColor))); +#endif + + const FfxFloat32x3 fAlpha = fUpsampledColorAndWeight.www / fAccumulation; + fHistoryColor = ffxLerp(fHistoryColor, fUpsampledColorAndWeight.xyz, fAlpha); + + fHistoryColor = YCoCgToRGB(fHistoryColor); + +#if FFX_FSR2_OPTION_HDR_COLOR_INPUT + fHistoryColor = InverseTonemap(fHistoryColor); +#endif +} + +void RectifyHistory( + const AccumulationPassCommonParams params, + RectificationBox clippingBox, + FFX_PARAMETER_INOUT FfxFloat32x3 fHistoryColor, + FFX_PARAMETER_INOUT FfxFloat32x3 fAccumulation, + FfxFloat32 fLockContributionThisFrame, + FfxFloat32 fTemporalReactiveFactor, + FfxFloat32 fLumaInstabilityFactor) +{ + FfxFloat32 fScaleFactorInfluence = ffxMin(20.0f, ffxPow(FfxFloat32(1.0f / length(DownscaleFactor().x * DownscaleFactor().y)), 3.0f)); + + const FfxFloat32 fVecolityFactor = ffxSaturate(params.fHrVelocity / 20.0f); + const FfxFloat32 fBoxScaleT = ffxMax(params.fDepthClipFactor, ffxMax(params.fAccumulationMask, fVecolityFactor)); + FfxFloat32 fBoxScale = ffxLerp(fScaleFactorInfluence, 1.0f, fBoxScaleT); + + FfxFloat32x3 fScaledBoxVec = clippingBox.boxVec * fBoxScale; + FfxFloat32x3 boxMin = clippingBox.boxCenter - fScaledBoxVec; + FfxFloat32x3 boxMax = clippingBox.boxCenter + fScaledBoxVec; + FfxFloat32x3 boxCenter = clippingBox.boxCenter; + FfxFloat32 boxVecSize = length(clippingBox.boxVec); + + boxMin = ffxMax(clippingBox.aabbMin, boxMin); + boxMax = ffxMin(clippingBox.aabbMax, boxMax); + + if (any(FFX_GREATER_THAN(boxMin, 
fHistoryColor)) || any(FFX_GREATER_THAN(fHistoryColor, boxMax))) { + + const FfxFloat32x3 fClampedHistoryColor = clamp(fHistoryColor, boxMin, boxMax); + + FfxFloat32x3 fHistoryContribution = ffxMax(fLumaInstabilityFactor, fLockContributionThisFrame).xxx; + + const FfxFloat32 fReactiveFactor = params.fDilatedReactiveFactor; + const FfxFloat32 fReactiveContribution = 1.0f - ffxPow(fReactiveFactor, 1.0f / 2.0f); + fHistoryContribution *= fReactiveContribution; + + // Scale history color using rectification info, also using accumulation mask to avoid potential invalid color protection + fHistoryColor = ffxLerp(fClampedHistoryColor, fHistoryColor, ffxSaturate(fHistoryContribution)); + + // Scale accumulation using rectification info + const FfxFloat32x3 fAccumulationMin = ffxMin(fAccumulation, FFX_BROADCAST_FLOAT32X3(0.1f)); + fAccumulation = ffxLerp(fAccumulationMin, fAccumulation, ffxSaturate(fHistoryContribution)); + } +} + +void WriteUpscaledOutput(FfxInt32x2 iPxHrPos, FfxFloat32x3 fUpscaledColor) +{ + StoreUpscaledOutput(iPxHrPos, fUpscaledColor); +} + +void FinalizeLockStatus(const AccumulationPassCommonParams params, FfxFloat32x2 fLockStatus, FfxFloat32 fUpsampledWeight) +{ + // we expect similar motion for next frame + // kill lock if that location is outside screen, avoid locks to be clamped to screen borders + FfxFloat32x2 fEstimatedUvNextFrame = params.fHrUv - params.fMotionVector; + if (IsUvInside(fEstimatedUvNextFrame) == false) { + KillLock(fLockStatus); + } + else { + // Decrease lock lifetime + const FfxFloat32 fLifetimeDecreaseLanczosMax = FfxFloat32(JitterSequenceLength()) * FfxFloat32(fAverageLanczosWeightPerFrame); + const FfxFloat32 fLifetimeDecrease = FfxFloat32(fUpsampledWeight / fLifetimeDecreaseLanczosMax); + fLockStatus[LOCK_LIFETIME_REMAINING] = ffxMax(FfxFloat32(0), fLockStatus[LOCK_LIFETIME_REMAINING] - fLifetimeDecrease); + } + + StoreLockStatus(params.iPxHrPos, fLockStatus); +} + + +FfxFloat32x3 ComputeBaseAccumulationWeight(const AccumulationPassCommonParams params, FfxFloat32 fThisFrameReactiveFactor, FfxBoolean bInMotionLastFrame, FfxFloat32 fUpsampledWeight, LockState lockState) +{ + // Always assume max accumulation was reached + FfxFloat32 fBaseAccumulation = fMaxAccumulationLanczosWeight * FfxFloat32(params.bIsExistingSample) * (1.0f - fThisFrameReactiveFactor) * (1.0f - params.fDepthClipFactor); + + fBaseAccumulation = ffxMin(fBaseAccumulation, ffxLerp(fBaseAccumulation, fUpsampledWeight * 10.0f, ffxMax(FfxFloat32(bInMotionLastFrame), ffxSaturate(params.fHrVelocity * FfxFloat32(10))))); + + fBaseAccumulation = ffxMin(fBaseAccumulation, ffxLerp(fBaseAccumulation, fUpsampledWeight, ffxSaturate(params.fHrVelocity / FfxFloat32(20)))); + + return fBaseAccumulation.xxx; +} + +FfxFloat32 ComputeLumaInstabilityFactor(const AccumulationPassCommonParams params, RectificationBox clippingBox, FfxFloat32 fThisFrameReactiveFactor, FfxFloat32 fLuminanceDiff) +{ + const FfxFloat32 fUnormThreshold = 1.0f / 255.0f; + const FfxInt32 N_MINUS_1 = 0; + const FfxInt32 N_MINUS_2 = 1; + const FfxInt32 N_MINUS_3 = 2; + const FfxInt32 N_MINUS_4 = 3; + + FfxFloat32 fCurrentFrameLuma = clippingBox.boxCenter.x; + +#if FFX_FSR2_OPTION_HDR_COLOR_INPUT + fCurrentFrameLuma = fCurrentFrameLuma / (1.0f + ffxMax(0.0f, fCurrentFrameLuma)); +#endif + + fCurrentFrameLuma = round(fCurrentFrameLuma * 255.0f) / 255.0f; + + const FfxBoolean bSampleLumaHistory = (ffxMax(ffxMax(params.fDepthClipFactor, params.fAccumulationMask), fLuminanceDiff) < 0.1f) && (params.bIsNewSample == false); + FfxFloat32x4 
fCurrentFrameLumaHistory = bSampleLumaHistory ? SampleLumaHistory(params.fReprojectedHrUv) : FFX_BROADCAST_FLOAT32X4(0.0f); + + FfxFloat32 fLumaInstability = 0.0f; + FfxFloat32 fDiffs0 = (fCurrentFrameLuma - fCurrentFrameLumaHistory[N_MINUS_1]); + + FfxFloat32 fMin = abs(fDiffs0); + + if (fMin >= fUnormThreshold) + { + for (int i = N_MINUS_2; i <= N_MINUS_4; i++) { + FfxFloat32 fDiffs1 = (fCurrentFrameLuma - fCurrentFrameLumaHistory[i]); + + if (sign(fDiffs0) == sign(fDiffs1)) { + + // Scale difference to protect historically similar values + const FfxFloat32 fMinBias = 1.0f; + fMin = ffxMin(fMin, abs(fDiffs1) * fMinBias); + } + } + + const FfxFloat32 fBoxSize = clippingBox.boxVec.x; + const FfxFloat32 fBoxSizeFactor = ffxPow(ffxSaturate(fBoxSize / 0.1f), 6.0f); + + fLumaInstability = FfxFloat32(fMin != abs(fDiffs0)) * fBoxSizeFactor; + fLumaInstability = FfxFloat32(fLumaInstability > fUnormThreshold); + + fLumaInstability *= 1.0f - ffxMax(params.fAccumulationMask, ffxPow(fThisFrameReactiveFactor, 1.0f / 6.0f)); + } + + //shift history + fCurrentFrameLumaHistory[N_MINUS_4] = fCurrentFrameLumaHistory[N_MINUS_3]; + fCurrentFrameLumaHistory[N_MINUS_3] = fCurrentFrameLumaHistory[N_MINUS_2]; + fCurrentFrameLumaHistory[N_MINUS_2] = fCurrentFrameLumaHistory[N_MINUS_1]; + fCurrentFrameLumaHistory[N_MINUS_1] = fCurrentFrameLuma; + + StoreLumaHistory(params.iPxHrPos, fCurrentFrameLumaHistory); + + return fLumaInstability * FfxFloat32(fCurrentFrameLumaHistory[N_MINUS_4] != 0); +} + +FfxFloat32 ComputeTemporalReactiveFactor(const AccumulationPassCommonParams params, FfxFloat32 fTemporalReactiveFactor) +{ + FfxFloat32 fNewFactor = ffxMin(0.99f, fTemporalReactiveFactor); + + fNewFactor = ffxMax(fNewFactor, ffxLerp(fNewFactor, 0.4f, ffxSaturate(params.fHrVelocity))); + + fNewFactor = ffxMax(fNewFactor * fNewFactor, ffxMax(params.fDepthClipFactor * 0.1f, params.fDilatedReactiveFactor)); + + // Force reactive factor for new samples + fNewFactor = params.bIsNewSample ? 
1.0f : fNewFactor; + + if (ffxSaturate(params.fHrVelocity * 10.0f) >= 1.0f) { + fNewFactor = ffxMax(FSR2_EPSILON, fNewFactor) * -1.0f; + } + + return fNewFactor; +} + +AccumulationPassCommonParams InitParams(FfxInt32x2 iPxHrPos) +{ + AccumulationPassCommonParams params; + + params.iPxHrPos = iPxHrPos; + const FfxFloat32x2 fHrUv = (iPxHrPos + 0.5f) / DisplaySize(); + params.fHrUv = fHrUv; + + const FfxFloat32x2 fLrUvJittered = fHrUv + Jitter() / RenderSize(); + params.fLrUv_HwSampler = ClampUv(fLrUvJittered, RenderSize(), MaxRenderSize()); + + params.fMotionVector = GetMotionVector(iPxHrPos, fHrUv); + params.fHrVelocity = GetPxHrVelocity(params.fMotionVector); + + ComputeReprojectedUVs(params, params.fReprojectedHrUv, params.bIsExistingSample); + + params.fDepthClipFactor = ffxSaturate(SampleDepthClip(params.fLrUv_HwSampler)); + + const FfxFloat32x2 fDilatedReactiveMasks = SampleDilatedReactiveMasks(params.fLrUv_HwSampler); + params.fDilatedReactiveFactor = fDilatedReactiveMasks.x; + params.fAccumulationMask = fDilatedReactiveMasks.y; + params.bIsResetFrame = (0 == FrameIndex()); + + params.bIsNewSample = (params.bIsExistingSample == false || params.bIsResetFrame); + + return params; +} + +void Accumulate(FfxInt32x2 iPxHrPos) +{ + const AccumulationPassCommonParams params = InitParams(iPxHrPos); + + FfxFloat32x3 fHistoryColor = FfxFloat32x3(0, 0, 0); + FfxFloat32x2 fLockStatus; + InitializeNewLockSample(fLockStatus); + + FfxFloat32 fTemporalReactiveFactor = 0.0f; + FfxBoolean bInMotionLastFrame = FFX_FALSE; + LockState lockState = { FFX_FALSE , FFX_FALSE }; + if (params.bIsExistingSample && !params.bIsResetFrame) { + ReprojectHistoryColor(params, fHistoryColor, fTemporalReactiveFactor, bInMotionLastFrame); + lockState = ReprojectHistoryLockStatus(params, fLockStatus); + } + + FfxFloat32 fThisFrameReactiveFactor = ffxMax(params.fDilatedReactiveFactor, fTemporalReactiveFactor); + + FfxFloat32 fLuminanceDiff = 0.0f; + FfxFloat32 fLockContributionThisFrame = 0.0f; + UpdateLockStatus(params, fThisFrameReactiveFactor, lockState, fLockStatus, fLockContributionThisFrame, fLuminanceDiff); + + // Load upsampled input color + RectificationBox clippingBox; + FfxFloat32x4 fUpsampledColorAndWeight = ComputeUpsampledColorAndWeight(params, clippingBox, fThisFrameReactiveFactor); + + const FfxFloat32 fLumaInstabilityFactor = ComputeLumaInstabilityFactor(params, clippingBox, fThisFrameReactiveFactor, fLuminanceDiff); + + + FfxFloat32x3 fAccumulation = ComputeBaseAccumulationWeight(params, fThisFrameReactiveFactor, bInMotionLastFrame, fUpsampledColorAndWeight.w, lockState); + + if (params.bIsNewSample) { + fHistoryColor = YCoCgToRGB(fUpsampledColorAndWeight.xyz); + } + else { + RectifyHistory(params, clippingBox, fHistoryColor, fAccumulation, fLockContributionThisFrame, fThisFrameReactiveFactor, fLumaInstabilityFactor); + + Accumulate(params, fHistoryColor, fAccumulation, fUpsampledColorAndWeight); + } + + fHistoryColor = UnprepareRgb(fHistoryColor, Exposure()); + + FinalizeLockStatus(params, fLockStatus, fUpsampledColorAndWeight.w); + + // Get new temporal reactive factor + fTemporalReactiveFactor = ComputeTemporalReactiveFactor(params, fThisFrameReactiveFactor); + + StoreInternalColorAndWeight(iPxHrPos, FfxFloat32x4(fHistoryColor, fTemporalReactiveFactor)); + + // Output final color when RCAS is disabled +#if FFX_FSR2_OPTION_APPLY_SHARPENING == 0 + WriteUpscaledOutput(iPxHrPos, fHistoryColor); +#endif + StoreNewLocks(iPxHrPos, 0); +} + +#endif // FFX_FSR2_ACCUMULATE_H diff --git 
a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl new file mode 100644 index 00000000000..d2306fec4c1 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl @@ -0,0 +1,92 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require +// Needed for rw_upscaled_output declaration +#extension GL_EXT_shader_image_load_formatted : require + +#define FSR2_BIND_SRV_INPUT_EXPOSURE 0 +#define FSR2_BIND_SRV_DILATED_REACTIVE_MASKS 1 +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS +#define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 2 +#else +#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 2 +#endif +#define FSR2_BIND_SRV_INTERNAL_UPSCALED 3 +#define FSR2_BIND_SRV_LOCK_STATUS 4 +#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 6 +#define FSR2_BIND_SRV_LUMA_INSTABILITY 7 +#define FSR2_BIND_SRV_LANCZOS_LUT 8 +#define FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT 9 +#define FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS 10 +#define FSR2_BIND_SRV_AUTO_EXPOSURE 11 +#define FSR2_BIND_SRV_LUMA_HISTORY 12 + +#define FSR2_BIND_UAV_INTERNAL_UPSCALED 13 +#define FSR2_BIND_UAV_LOCK_STATUS 14 +#define FSR2_BIND_UAV_UPSCALED_OUTPUT 15 +#define FSR2_BIND_UAV_NEW_LOCKS 16 +#define FSR2_BIND_UAV_LUMA_HISTORY 17 + +#define FSR2_BIND_CB_FSR2 18 + +// -- GODOT start -- +#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS +#define FSR2_BIND_SRV_INPUT_DEPTH 5 +#endif +// -- GODOT end -- + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" +#include "ffx_fsr2_sample.h" +#include "ffx_fsr2_upsample.h" +#include "ffx_fsr2_postprocess_lock_status.h" +#include "ffx_fsr2_reproject.h" +#include "ffx_fsr2_accumulate.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; + +#endif // #ifndef 
FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + uvec2 uGroupId = gl_WorkGroupID.xy; + const uint GroupRows = (uint(DisplaySize().y) + FFX_FSR2_THREAD_GROUP_HEIGHT - 1) / FFX_FSR2_THREAD_GROUP_HEIGHT; + uGroupId.y = GroupRows - uGroupId.y - 1; + + uvec2 uDispatchThreadId = uGroupId * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy; + + Accumulate(ivec2(uDispatchThreadId)); +} \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl new file mode 100644 index 00000000000..e62b4459246 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl @@ -0,0 +1,93 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
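+// Auto-reactive generation: this pass estimates a reactive mask by comparing the
+// opaque-only scene color against the post-alpha (final) color. The per-pixel difference
+// (maximum component or vector length, depending on FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX)
+// is scaled by cbGenerateReactive.scale and optionally thresholded to a binary value
+// before being stored to rw_output_autoreactive.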
+ + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_INPUT_OPAQUE_ONLY 0 +#define FSR2_BIND_SRV_INPUT_COLOR 1 +#define FSR2_BIND_UAV_AUTOREACTIVE 2 +#define FSR2_BIND_CB_REACTIVE 3 +#define FSR2_BIND_CB_FSR2 4 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" + +// layout (set = 1, binding = FSR2_BIND_SRV_PRE_ALPHA_COLOR) uniform texture2D r_input_color_pre_alpha; +// layout (set = 1, binding = FSR2_BIND_SRV_POST_ALPHA_COLOR) uniform texture2D r_input_color_post_alpha; +// layout (set = 1, binding = FSR2_BIND_UAV_REACTIVE, r8) uniform image2D rw_output_reactive_mask; + + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +#if defined(FSR2_BIND_CB_REACTIVE) +layout (set = 1, binding = FSR2_BIND_CB_REACTIVE, std140) uniform cbGenerateReactive_t +{ + float scale; + float threshold; + float binaryValue; + uint flags; +} cbGenerateReactive; +#endif + +FFX_FSR2_NUM_THREADS +void main() +{ + FfxUInt32x2 uDispatchThreadId = gl_GlobalInvocationID.xy; + + FfxFloat32x3 ColorPreAlpha = LoadOpaqueOnly(FFX_MIN16_I2(uDispatchThreadId)).rgb; + FfxFloat32x3 ColorPostAlpha = LoadInputColor(FFX_MIN16_I2(uDispatchThreadId)).rgb; + + if ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP) != 0) + { + ColorPreAlpha = Tonemap(ColorPreAlpha); + ColorPostAlpha = Tonemap(ColorPostAlpha); + } + + if ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP) != 0) + { + ColorPreAlpha = InverseTonemap(ColorPreAlpha); + ColorPostAlpha = InverseTonemap(ColorPostAlpha); + } + + FfxFloat32 out_reactive_value = 0.f; + FfxFloat32x3 delta = abs(ColorPostAlpha - ColorPreAlpha); + + out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX)!=0) ? max(delta.x, max(delta.y, delta.z)) : length(delta); + out_reactive_value *= cbGenerateReactive.scale; + + out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD)!=0) ? ((out_reactive_value < cbGenerateReactive.threshold) ? 0 : cbGenerateReactive.binaryValue) : out_reactive_value; + + imageStore(rw_output_autoreactive, FfxInt32x2(uDispatchThreadId), vec4(out_reactive_value)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h new file mode 100644 index 00000000000..45279bd3574 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h @@ -0,0 +1,704 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +#include "ffx_fsr2_resources.h" + +#if defined(FFX_GPU) +#include "ffx_core.h" +#endif // #if defined(FFX_GPU) + +#if defined(FFX_GPU) +#ifndef FFX_FSR2_PREFER_WAVE64 +#define FFX_FSR2_PREFER_WAVE64 +#endif // #if defined(FFX_GPU) + +#if defined(FSR2_BIND_CB_FSR2) + layout (set = 1, binding = FSR2_BIND_CB_FSR2, std140) uniform cbFSR2_t + { + FfxInt32x2 iRenderSize; + FfxInt32x2 iMaxRenderSize; + FfxInt32x2 iDisplaySize; + FfxInt32x2 iInputColorResourceDimensions; + FfxInt32x2 iLumaMipDimensions; + FfxInt32 iLumaMipLevelToUse; + FfxInt32 iFrameIndex; + + FfxFloat32x4 fDeviceToViewDepth; + FfxFloat32x2 fJitter; + FfxFloat32x2 fMotionVectorScale; + FfxFloat32x2 fDownscaleFactor; + FfxFloat32x2 fMotionVectorJitterCancellation; + FfxFloat32 fPreExposure; + FfxFloat32 fPreviousFramePreExposure; + FfxFloat32 fTanHalfFOV; + FfxFloat32 fJitterSequenceLength; + FfxFloat32 fDeltaTime; + FfxFloat32 fDynamicResChangeFactor; + FfxFloat32 fViewSpaceToMetersFactor; + + // -- GODOT start -- + FfxFloat32 fPad; + mat4 mReprojectionMatrix; + // -- GODOT end -- + } cbFSR2; +#endif + +FfxInt32x2 RenderSize() +{ + return cbFSR2.iRenderSize; +} + +FfxInt32x2 MaxRenderSize() +{ + return cbFSR2.iMaxRenderSize; +} + +FfxInt32x2 DisplaySize() +{ + return cbFSR2.iDisplaySize; +} + +FfxInt32x2 InputColorResourceDimensions() +{ + return cbFSR2.iInputColorResourceDimensions; +} + +FfxInt32x2 LumaMipDimensions() +{ + return cbFSR2.iLumaMipDimensions; +} + +FfxInt32 LumaMipLevelToUse() +{ + return cbFSR2.iLumaMipLevelToUse; +} + +FfxInt32 FrameIndex() +{ + return cbFSR2.iFrameIndex; +} + +FfxFloat32x4 DeviceToViewSpaceTransformFactors() +{ + return cbFSR2.fDeviceToViewDepth; +} + +FfxFloat32x2 Jitter() +{ + return cbFSR2.fJitter; +} + +FfxFloat32x2 MotionVectorScale() +{ + return cbFSR2.fMotionVectorScale; +} + +FfxFloat32x2 DownscaleFactor() +{ + return cbFSR2.fDownscaleFactor; +} + +FfxFloat32x2 MotionVectorJitterCancellation() +{ + return cbFSR2.fMotionVectorJitterCancellation; +} + +FfxFloat32 PreExposure() +{ + return cbFSR2.fPreExposure; +} + +FfxFloat32 PreviousFramePreExposure() +{ + return cbFSR2.fPreviousFramePreExposure; +} + +FfxFloat32 TanHalfFoV() +{ + return cbFSR2.fTanHalfFOV; +} + +FfxFloat32 JitterSequenceLength() +{ + return cbFSR2.fJitterSequenceLength; +} + +FfxFloat32 DeltaTime() +{ + return cbFSR2.fDeltaTime; +} + +FfxFloat32 DynamicResChangeFactor() +{ + return cbFSR2.fDynamicResChangeFactor; +} + +FfxFloat32 
ViewSpaceToMetersFactor() +{ + return cbFSR2.fViewSpaceToMetersFactor; +} + +layout (set = 0, binding = 0) uniform sampler s_PointClamp; +layout (set = 0, binding = 1) uniform sampler s_LinearClamp; + +// SRVs +#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) uniform texture2D r_input_opaque_only; +#endif +#if defined(FSR2_BIND_SRV_INPUT_COLOR) + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_COLOR) uniform texture2D r_input_color_jittered; +#endif +#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS) + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_MOTION_VECTORS) uniform texture2D r_input_motion_vectors; +#endif +#if defined(FSR2_BIND_SRV_INPUT_DEPTH) + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_DEPTH) uniform texture2D r_input_depth; +#endif +#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE) + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_EXPOSURE) uniform texture2D r_input_exposure; +#endif +#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE) + layout(set = 1, binding = FSR2_BIND_SRV_AUTO_EXPOSURE) uniform texture2D r_auto_exposure; +#endif +#if defined(FSR2_BIND_SRV_REACTIVE_MASK) + layout (set = 1, binding = FSR2_BIND_SRV_REACTIVE_MASK) uniform texture2D r_reactive_mask; +#endif +#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) + layout (set = 1, binding = FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) uniform texture2D r_transparency_and_composition_mask; +#endif +#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) + layout (set = 1, binding = FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) uniform utexture2D r_reconstructed_previous_nearest_depth; +#endif +#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_MOTION_VECTORS) uniform texture2D r_dilated_motion_vectors; +#endif +#if defined (FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) + layout(set = 1, binding = FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) uniform texture2D r_previous_dilated_motion_vectors; +#endif +#if defined(FSR2_BIND_SRV_DILATED_DEPTH) + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_DEPTH) uniform texture2D r_dilatedDepth; +#endif +#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) + layout (set = 1, binding = FSR2_BIND_SRV_INTERNAL_UPSCALED) uniform texture2D r_internal_upscaled_color; +#endif +#if defined(FSR2_BIND_SRV_LOCK_STATUS) + layout (set = 1, binding = FSR2_BIND_SRV_LOCK_STATUS) uniform texture2D r_lock_status; +#endif +#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA) + layout (set = 1, binding = FSR2_BIND_SRV_LOCK_INPUT_LUMA) uniform texture2D r_lock_input_luma; +#endif +#if defined(FSR2_BIND_SRV_NEW_LOCKS) + layout(set = 1, binding = FSR2_BIND_SRV_NEW_LOCKS) uniform texture2D r_new_locks; +#endif +#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) + layout (set = 1, binding = FSR2_BIND_SRV_PREPARED_INPUT_COLOR) uniform texture2D r_prepared_input_color; +#endif +#if defined(FSR2_BIND_SRV_LUMA_HISTORY) + layout (set = 1, binding = FSR2_BIND_SRV_LUMA_HISTORY) uniform texture2D r_luma_history; +#endif +#if defined(FSR2_BIND_SRV_RCAS_INPUT) + layout (set = 1, binding = FSR2_BIND_SRV_RCAS_INPUT) uniform texture2D r_rcas_input; +#endif +#if defined(FSR2_BIND_SRV_LANCZOS_LUT) + layout (set = 1, binding = FSR2_BIND_SRV_LANCZOS_LUT) uniform texture2D r_lanczos_lut; +#endif +#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) + layout (set = 1, binding = FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) uniform texture2D r_imgMips; +#endif +#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) + layout (set = 1, binding = 
FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) uniform texture2D r_upsample_maximum_bias_lut; +#endif +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) uniform texture2D r_dilated_reactive_masks; +#endif +#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) + layout(set = 1, binding = FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) uniform texture2D r_input_prev_color_pre_alpha; +#endif +#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) + layout(set = 1, binding = FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) uniform texture2D r_input_prev_color_post_alpha; +#endif + +// UAV +#if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH + layout (set = 1, binding = FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH, r32ui) uniform uimage2D rw_reconstructed_previous_nearest_depth; +#endif +#if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_MOTION_VECTORS, rg16f) writeonly uniform image2D rw_dilated_motion_vectors; +#endif +#if defined FSR2_BIND_UAV_DILATED_DEPTH + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_DEPTH, r16f) writeonly uniform image2D rw_dilatedDepth; +#endif +#if defined FSR2_BIND_UAV_INTERNAL_UPSCALED + layout (set = 1, binding = FSR2_BIND_UAV_INTERNAL_UPSCALED, rgba16f) writeonly uniform image2D rw_internal_upscaled_color; +#endif +#if defined FSR2_BIND_UAV_LOCK_STATUS + layout (set = 1, binding = FSR2_BIND_UAV_LOCK_STATUS, rg16f) uniform image2D rw_lock_status; +#endif +#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA) + layout(set = 1, binding = FSR2_BIND_UAV_LOCK_INPUT_LUMA, r16f) writeonly uniform image2D rw_lock_input_luma; +#endif +#if defined FSR2_BIND_UAV_NEW_LOCKS + layout(set = 1, binding = FSR2_BIND_UAV_NEW_LOCKS, r8) uniform image2D rw_new_locks; +#endif +#if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR + layout (set = 1, binding = FSR2_BIND_UAV_PREPARED_INPUT_COLOR, rgba16) writeonly uniform image2D rw_prepared_input_color; +#endif +#if defined FSR2_BIND_UAV_LUMA_HISTORY + layout (set = 1, binding = FSR2_BIND_UAV_LUMA_HISTORY, rgba8) uniform image2D rw_luma_history; +#endif +#if defined FSR2_BIND_UAV_UPSCALED_OUTPUT + layout (set = 1, binding = FSR2_BIND_UAV_UPSCALED_OUTPUT /* app controlled format */) writeonly uniform image2D rw_upscaled_output; +#endif +#if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE, r16f) coherent uniform image2D rw_img_mip_shading_change; +#endif +#if defined FSR2_BIND_UAV_EXPOSURE_MIP_5 + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_5, r16f) coherent uniform image2D rw_img_mip_5; +#endif +#if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_REACTIVE_MASKS, rg8) writeonly uniform image2D rw_dilated_reactive_masks; +#endif +#if defined FSR2_BIND_UAV_EXPOSURE + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE, rg32f) uniform image2D rw_exposure; +#endif +#if defined FSR2_BIND_UAV_AUTO_EXPOSURE + layout(set = 1, binding = FSR2_BIND_UAV_AUTO_EXPOSURE, rg32f) uniform image2D rw_auto_exposure; +#endif +#if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC + layout (set = 1, binding = FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC, r32ui) coherent uniform uimage2D rw_spd_global_atomic; +#endif + +#if defined FSR2_BIND_UAV_AUTOREACTIVE + layout(set = 1, binding = FSR2_BIND_UAV_AUTOREACTIVE, r32f) uniform image2D rw_output_autoreactive; +#endif +#if defined FSR2_BIND_UAV_AUTOCOMPOSITION + layout(set = 1, binding = FSR2_BIND_UAV_AUTOCOMPOSITION, r32f) uniform image2D 
rw_output_autocomposition; +#endif +#if defined FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR + layout(set = 1, binding = FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR, r11f_g11f_b10f) uniform image2D rw_output_prev_color_pre_alpha; +#endif +#if defined FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR + layout(set = 1, binding = FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR, r11f_g11f_b10f) uniform image2D rw_output_prev_color_post_alpha; +#endif + +#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) +FfxFloat32 LoadMipLuma(FfxInt32x2 iPxPos, FfxInt32 mipLevel) +{ + return texelFetch(r_imgMips, iPxPos, FfxInt32(mipLevel)).r; +} +#endif + +#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) +FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxInt32 mipLevel) +{ + return textureLod(sampler2D(r_imgMips, s_LinearClamp), fUV, FfxFloat32(mipLevel)).r; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_DEPTH) +FfxFloat32 LoadInputDepth(FfxInt32x2 iPxPos) +{ + return texelFetch(r_input_depth, iPxPos, 0).r; +} +#endif + +#if defined(FSR2_BIND_SRV_REACTIVE_MASK) +FfxFloat32 LoadReactiveMask(FfxInt32x2 iPxPos) +{ +// -- GODOT start -- +#if FFX_FSR2_OPTION_GODOT_REACTIVE_MASK_CLAMP + return min(texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r, 0.9f); +#else + return texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r; +#endif +// -- GODOT end -- +} +#endif + +#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) +FfxFloat32 LoadTransparencyAndCompositionMask(FfxUInt32x2 iPxPos) +{ + return texelFetch(r_transparency_and_composition_mask, FfxInt32x2(iPxPos), 0).r; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_COLOR) +FfxFloat32x3 LoadInputColor(FfxInt32x2 iPxPos) +{ + return texelFetch(r_input_color_jittered, iPxPos, 0).rgb; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_COLOR) +FfxFloat32x3 SampleInputColor(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_input_color_jittered, s_LinearClamp), fUV, 0.0f).rgb; +} +#endif + +#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) +FfxFloat32x3 LoadPreparedInputColor(FfxInt32x2 iPxPos) +{ + return texelFetch(r_prepared_input_color, iPxPos, 0).xyz; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS) +FfxFloat32x2 LoadInputMotionVector(FfxInt32x2 iPxDilatedMotionVectorPos) +{ + FfxFloat32x2 fSrcMotionVector = texelFetch(r_input_motion_vectors, iPxDilatedMotionVectorPos, 0).xy; + +// -- GODOT start -- +#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS + bool bInvalidMotionVector = all(lessThanEqual(fSrcMotionVector, vec2(-1.0f, -1.0f))); + if (bInvalidMotionVector) + { + FfxFloat32 fSrcDepth = LoadInputDepth(iPxDilatedMotionVectorPos); + FfxFloat32x2 fUv = (iPxDilatedMotionVectorPos + FfxFloat32(0.5)) / RenderSize(); + fSrcMotionVector = FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS_FUNCTION(fUv, fSrcDepth, cbFSR2.mReprojectionMatrix); + } +#endif +// -- GODOT end -- + + FfxFloat32x2 fUvMotionVector = fSrcMotionVector * MotionVectorScale(); + +#if FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS + fUvMotionVector -= MotionVectorJitterCancellation(); +#endif + + return fUvMotionVector; +} +#endif + +#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) +FfxFloat32x4 LoadHistory(FfxInt32x2 iPxHistory) +{ + return texelFetch(r_internal_upscaled_color, iPxHistory, 0); +} +#endif + +#if defined(FSR2_BIND_UAV_LUMA_HISTORY) +void StoreLumaHistory(FfxInt32x2 iPxPos, FfxFloat32x4 fLumaHistory) +{ + imageStore(rw_luma_history, FfxInt32x2(iPxPos), fLumaHistory); +} +#endif + +#if defined(FSR2_BIND_SRV_LUMA_HISTORY) +FfxFloat32x4 SampleLumaHistory(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_luma_history, 
s_LinearClamp), fUV, 0.0f); +} +#endif + +#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) +void StoreReprojectedHistory(FfxInt32x2 iPxHistory, FfxFloat32x4 fHistory) +{ + imageStore(rw_internal_upscaled_color, iPxHistory, fHistory); +} +#endif + +#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) +void StoreInternalColorAndWeight(FfxInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight) +{ + imageStore(rw_internal_upscaled_color, FfxInt32x2(iPxPos), fColorAndWeight); +} +#endif + +#if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT) +void StoreUpscaledOutput(FfxInt32x2 iPxPos, FfxFloat32x3 fColor) +{ + imageStore(rw_upscaled_output, FfxInt32x2(iPxPos), FfxFloat32x4(fColor, 1.f)); +} +#endif + +#if defined(FSR2_BIND_SRV_LOCK_STATUS) +FfxFloat32x2 LoadLockStatus(FfxInt32x2 iPxPos) +{ + FfxFloat32x2 fLockStatus = texelFetch(r_lock_status, iPxPos, 0).rg; + + return fLockStatus; +} +#endif + +#if defined(FSR2_BIND_UAV_LOCK_STATUS) +void StoreLockStatus(FfxInt32x2 iPxPos, FfxFloat32x2 fLockstatus) +{ + imageStore(rw_lock_status, iPxPos, vec4(fLockstatus, 0.0f, 0.0f)); +} +#endif + +#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA) +FfxFloat32 LoadLockInputLuma(FfxInt32x2 iPxPos) +{ + return texelFetch(r_lock_input_luma, iPxPos, 0).r; +} +#endif + +#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA) +void StoreLockInputLuma(FfxInt32x2 iPxPos, FfxFloat32 fLuma) +{ + imageStore(rw_lock_input_luma, iPxPos, vec4(fLuma, 0, 0, 0)); +} +#endif + +#if defined(FSR2_BIND_SRV_NEW_LOCKS) +FfxFloat32 LoadNewLocks(FfxInt32x2 iPxPos) +{ + return texelFetch(r_new_locks, iPxPos, 0).r; +} +#endif + +#if defined(FSR2_BIND_UAV_NEW_LOCKS) +FfxFloat32 LoadRwNewLocks(FfxInt32x2 iPxPos) +{ + return imageLoad(rw_new_locks, iPxPos).r; +} +#endif + +#if defined(FSR2_BIND_UAV_NEW_LOCKS) +void StoreNewLocks(FfxInt32x2 iPxPos, FfxFloat32 newLock) +{ + imageStore(rw_new_locks, iPxPos, vec4(newLock, 0, 0, 0)); +} +#endif + +#if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR) +void StorePreparedInputColor(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped) +{ + imageStore(rw_prepared_input_color, iPxPos, fTonemapped); +} +#endif + +#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) +FfxFloat32 SampleDepthClip(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_prepared_input_color, s_LinearClamp), fUV, 0.0f).w; +} +#endif + +#if defined(FSR2_BIND_SRV_LOCK_STATUS) +FfxFloat32x2 SampleLockStatus(FfxFloat32x2 fUV) +{ + FfxFloat32x2 fLockStatus = textureLod(sampler2D(r_lock_status, s_LinearClamp), fUV, 0.0f).rg; + return fLockStatus; +} +#endif + +#if defined(FSR2_BIND_SRV_DEPTH) +FfxFloat32 LoadSceneDepth(FfxInt32x2 iPxInput) +{ + return texelFetch(r_input_depth, iPxInput, 0).r; +} +#endif + +#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) +FfxFloat32 LoadReconstructedPrevDepth(FfxInt32x2 iPxPos) +{ + return uintBitsToFloat(texelFetch(r_reconstructed_previous_nearest_depth, iPxPos, 0).r); +} +#endif + +#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) +void StoreReconstructedDepth(FfxInt32x2 iPxSample, FfxFloat32 fDepth) +{ + FfxUInt32 uDepth = floatBitsToUint(fDepth); + + #if FFX_FSR2_OPTION_INVERTED_DEPTH + imageAtomicMax(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth); + #else + imageAtomicMin(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth); // min for standard, max for inverted depth + #endif +} +#endif + +#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) +void SetReconstructedDepth(FfxInt32x2 iPxSample, FfxUInt32 uValue) +{ + imageStore(rw_reconstructed_previous_nearest_depth, iPxSample, 
uvec4(uValue, 0, 0, 0)); +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_DEPTH) +void StoreDilatedDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth) +{ + //FfxUInt32 uDepth = f32tof16(fDepth); + imageStore(rw_dilatedDepth, iPxPos, vec4(fDepth, 0.0f, 0.0f, 0.0f)); +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS) +void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector) +{ + imageStore(rw_dilated_motion_vectors, iPxPos, vec4(fMotionVector, 0.0f, 0.0f)); +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) +FfxFloat32x2 LoadDilatedMotionVector(FfxInt32x2 iPxInput) +{ + return texelFetch(r_dilated_motion_vectors, iPxInput, 0).rg; +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) +FfxFloat32x2 SampleDilatedMotionVector(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_dilated_motion_vectors, s_LinearClamp), fUV, 0.0f).rg; +} +#endif + +#if defined(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) +FfxFloat32x2 LoadPreviousDilatedMotionVector(FfxInt32x2 iPxInput) +{ + return texelFetch(r_previous_dilated_motion_vectors, iPxInput, 0).rg; +} + +FfxFloat32x2 SamplePreviousDilatedMotionVector(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_previous_dilated_motion_vectors, s_LinearClamp), fUV, 0.0f).xy; +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_DEPTH) +FfxFloat32 LoadDilatedDepth(FfxInt32x2 iPxInput) +{ + return texelFetch(r_dilatedDepth, iPxInput, 0).r; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE) +FfxFloat32 Exposure() +{ + FfxFloat32 exposure = texelFetch(r_input_exposure, FfxInt32x2(0, 0), 0).x; + + if (exposure == 0.0f) { + exposure = 1.0f; + } + + return exposure; +} +#endif + +#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE) +FfxFloat32 AutoExposure() +{ + FfxFloat32 exposure = texelFetch(r_auto_exposure, FfxInt32x2(0, 0), 0).x; + + if (exposure == 0.0f) { + exposure = 1.0f; + } + + return exposure; +} +#endif + +FfxFloat32 SampleLanczos2Weight(FfxFloat32 x) +{ +#if defined(FSR2_BIND_SRV_LANCZOS_LUT) + return textureLod(sampler2D(r_lanczos_lut, s_LinearClamp), FfxFloat32x2(x / 2.0f, 0.5f), 0.0f).x; +#else + return 0.f; +#endif +} + +#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) +FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv) +{ + // Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range. 
+ return FfxFloat32(2.0f) * FfxFloat32(textureLod(sampler2D(r_upsample_maximum_bias_lut, s_LinearClamp), abs(uv) * 2.0f, 0.0f).r); +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) +FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_dilated_reactive_masks, s_LinearClamp), fUV, 0.0f).rg; +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) +FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos) +{ + return texelFetch(r_dilated_reactive_masks, iPxPos, 0).rg; +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS) +void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks) +{ + imageStore(rw_dilated_reactive_masks, iPxPos, vec4(fDilatedReactiveMasks, 0.0f, 0.0f)); +} +#endif + +#if defined(FFX_INTERNAL) +FfxFloat32x4 SampleDebug(FfxFloat32x2 fUV) +{ + return textureLod(sampler2D(r_debug_out, s_LinearClamp), fUV, 0.0f).rgba; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) +FfxFloat32x3 LoadOpaqueOnly(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return texelFetch(r_input_opaque_only, iPxPos, 0).xyz; +} +#endif + +#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) +FfxFloat32x3 LoadPrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return texelFetch(r_input_prev_color_pre_alpha, iPxPos, 0).xyz; +} +#endif + +#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) +FfxFloat32x3 LoadPrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return texelFetch(r_input_prev_color_post_alpha, iPxPos, 0).xyz; +} +#endif + +#if defined(FSR2_BIND_UAV_AUTOREACTIVE) +#if defined(FSR2_BIND_UAV_AUTOCOMPOSITION) +void StoreAutoReactive(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F2 fReactive) +{ + imageStore(rw_output_autoreactive, iPxPos, vec4(FfxFloat32(fReactive.x), 0.0f, 0.0f, 0.0f)); + + imageStore(rw_output_autocomposition, iPxPos, vec4(FfxFloat32(fReactive.y), 0.0f, 0.0f, 0.0f)); +} +#endif +#endif + +#if defined(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR) +void StorePrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color) +{ + imageStore(rw_output_prev_color_pre_alpha, iPxPos, vec4(color, 0.0f)); +} +#endif + +#if defined(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR) +void StorePrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color) +{ + imageStore(rw_output_prev_color_post_alpha, iPxPos, vec4(color, 0.0f)); +} +#endif + +#endif // #if defined(FFX_GPU) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_hlsl.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_hlsl.h new file mode 100644 index 00000000000..fd722b307e2 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_hlsl.h @@ -0,0 +1,799 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "ffx_fsr2_resources.h" + +#if defined(FFX_GPU) +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic push +#pragma dxc diagnostic ignored "-Wambig-lit-shift" +#endif //__hlsl_dx_compiler +#include "ffx_core.h" +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic pop +#endif //__hlsl_dx_compiler +#endif // #if defined(FFX_GPU) + +#if defined(FFX_GPU) +#ifndef FFX_FSR2_PREFER_WAVE64 +#define FFX_FSR2_PREFER_WAVE64 +#endif // #if defined(FFX_GPU) + +#if defined(FFX_GPU) +#pragma warning(disable: 3205) // conversion from larger type to smaller +#endif // #if defined(FFX_GPU) + +#define DECLARE_SRV_REGISTER(regIndex) t##regIndex +#define DECLARE_UAV_REGISTER(regIndex) u##regIndex +#define DECLARE_CB_REGISTER(regIndex) b##regIndex +#define FFX_FSR2_DECLARE_SRV(regIndex) register(DECLARE_SRV_REGISTER(regIndex)) +#define FFX_FSR2_DECLARE_UAV(regIndex) register(DECLARE_UAV_REGISTER(regIndex)) +#define FFX_FSR2_DECLARE_CB(regIndex) register(DECLARE_CB_REGISTER(regIndex)) + +#if defined(FSR2_BIND_CB_FSR2) || defined(FFX_INTERNAL) + cbuffer cbFSR2 : FFX_FSR2_DECLARE_CB(FSR2_BIND_CB_FSR2) + { + FfxInt32x2 iRenderSize; + FfxInt32x2 iMaxRenderSize; + FfxInt32x2 iDisplaySize; + FfxInt32x2 iInputColorResourceDimensions; + FfxInt32x2 iLumaMipDimensions; + FfxInt32 iLumaMipLevelToUse; + FfxInt32 iFrameIndex; + + FfxFloat32x4 fDeviceToViewDepth; + FfxFloat32x2 fJitter; + FfxFloat32x2 fMotionVectorScale; + FfxFloat32x2 fDownscaleFactor; + FfxFloat32x2 fMotionVectorJitterCancellation; + FfxFloat32 fPreExposure; + FfxFloat32 fPreviousFramePreExposure; + FfxFloat32 fTanHalfFOV; + FfxFloat32 fJitterSequenceLength; + FfxFloat32 fDeltaTime; + FfxFloat32 fDynamicResChangeFactor; + FfxFloat32 fViewSpaceToMetersFactor; + }; + +#define FFX_FSR2_CONSTANT_BUFFER_1_SIZE (sizeof(cbFSR2) / 4) // Number of 32-bit values. This must be kept in sync with the cbFSR2 size. +#endif + +#if defined(FFX_GPU) +#define FFX_FSR2_ROOTSIG_STRINGIFY(p) FFX_FSR2_ROOTSIG_STR(p) +#define FFX_FSR2_ROOTSIG_STR(p) #p +#define FFX_FSR2_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "DescriptorTable(SRV(t0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_1_SIZE) ", b0), " \ + "StaticSampler(s0, filter = FILTER_MIN_MAG_MIP_POINT, " \ + "addressU = TEXTURE_ADDRESS_CLAMP, " \ + "addressV = TEXTURE_ADDRESS_CLAMP, " \ + "addressW = TEXTURE_ADDRESS_CLAMP, " \ + "comparisonFunc = COMPARISON_NEVER, " \ + "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK), " \ + "StaticSampler(s1, filter = FILTER_MIN_MAG_MIP_LINEAR, " \ + "addressU = TEXTURE_ADDRESS_CLAMP, " \ + "addressV = TEXTURE_ADDRESS_CLAMP, " \ + "addressW = TEXTURE_ADDRESS_CLAMP, " \ + "comparisonFunc = COMPARISON_NEVER, " \ + "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )] + +#define FFX_FSR2_CONSTANT_BUFFER_2_SIZE 6 // Number of 32-bit values. 
This must be kept in sync with max( cbRCAS , cbSPD) size. + +#define FFX_FSR2_CB2_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "DescriptorTable(SRV(t0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_1_SIZE) ", b0), " \ + "RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_2_SIZE) ", b1), " \ + "StaticSampler(s0, filter = FILTER_MIN_MAG_MIP_POINT, " \ + "addressU = TEXTURE_ADDRESS_CLAMP, " \ + "addressV = TEXTURE_ADDRESS_CLAMP, " \ + "addressW = TEXTURE_ADDRESS_CLAMP, " \ + "comparisonFunc = COMPARISON_NEVER, " \ + "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK), " \ + "StaticSampler(s1, filter = FILTER_MIN_MAG_MIP_LINEAR, " \ + "addressU = TEXTURE_ADDRESS_CLAMP, " \ + "addressV = TEXTURE_ADDRESS_CLAMP, " \ + "addressW = TEXTURE_ADDRESS_CLAMP, " \ + "comparisonFunc = COMPARISON_NEVER, " \ + "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )] +#if defined(FFX_FSR2_EMBED_ROOTSIG) +#define FFX_FSR2_EMBED_ROOTSIG_CONTENT FFX_FSR2_ROOTSIG +#define FFX_FSR2_EMBED_CB2_ROOTSIG_CONTENT FFX_FSR2_CB2_ROOTSIG +#else +#define FFX_FSR2_EMBED_ROOTSIG_CONTENT +#define FFX_FSR2_EMBED_CB2_ROOTSIG_CONTENT +#endif // #if FFX_FSR2_EMBED_ROOTSIG +#endif // #if defined(FFX_GPU) + +/* Define getter functions in the order they are defined in the CB! */ +FfxInt32x2 RenderSize() +{ + return iRenderSize; +} + +FfxInt32x2 MaxRenderSize() +{ + return iMaxRenderSize; +} + +FfxInt32x2 DisplaySize() +{ + return iDisplaySize; +} + +FfxInt32x2 InputColorResourceDimensions() +{ + return iInputColorResourceDimensions; +} + +FfxInt32x2 LumaMipDimensions() +{ + return iLumaMipDimensions; +} + +FfxInt32 LumaMipLevelToUse() +{ + return iLumaMipLevelToUse; +} + +FfxInt32 FrameIndex() +{ + return iFrameIndex; +} + +FfxFloat32x2 Jitter() +{ + return fJitter; +} + +FfxFloat32x4 DeviceToViewSpaceTransformFactors() +{ + return fDeviceToViewDepth; +} + +FfxFloat32x2 MotionVectorScale() +{ + return fMotionVectorScale; +} + +FfxFloat32x2 DownscaleFactor() +{ + return fDownscaleFactor; +} + +FfxFloat32x2 MotionVectorJitterCancellation() +{ + return fMotionVectorJitterCancellation; +} + +FfxFloat32 PreExposure() +{ + return fPreExposure; +} + +FfxFloat32 PreviousFramePreExposure() +{ + return fPreviousFramePreExposure; +} + +FfxFloat32 TanHalfFoV() +{ + return fTanHalfFOV; +} + +FfxFloat32 JitterSequenceLength() +{ + return fJitterSequenceLength; +} + +FfxFloat32 DeltaTime() +{ + return fDeltaTime; +} + +FfxFloat32 DynamicResChangeFactor() +{ + return fDynamicResChangeFactor; +} + +FfxFloat32 ViewSpaceToMetersFactor() +{ + return fViewSpaceToMetersFactor; +} + + +SamplerState s_PointClamp : register(s0); +SamplerState s_LinearClamp : register(s1); + +// SRVs +#if defined(FFX_INTERNAL) + Texture2D r_input_opaque_only : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY); + Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR); + Texture2D r_input_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS); + Texture2D r_input_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH); + Texture2D r_input_exposure : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE); + Texture2D r_auto_exposure : 
FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE); + Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK); + Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK); + Texture2D r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH); + Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS); + Texture2D r_previous_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS); + Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH); + Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR); + Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS); + Texture2D r_lock_input_luma : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA); + Texture2D r_new_locks : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS); + Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR); + Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY); + Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT); + Texture2D r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT); + Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE); + Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT); + Texture2D r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS); + Texture2D r_input_prev_color_pre_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR); + Texture2D r_input_prev_color_post_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR); + + Texture2D r_debug_out : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT); + + // UAV declarations + RWTexture2D rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH); + RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS); + RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH); + RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR); + RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS); + RWTexture2D rw_lock_input_luma : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA); + RWTexture2D rw_new_locks : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS); + RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR); + RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY); + RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT); + + globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE); + globallycoherent RWTexture2D rw_img_mip_5 : 
FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5); + RWTexture2D rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS); + RWTexture2D rw_auto_exposure : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE); + globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT); + RWTexture2D rw_debug_out : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT); + + RWTexture2D rw_output_autoreactive : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE); + RWTexture2D rw_output_autocomposition : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION); + RWTexture2D rw_output_prev_color_pre_alpha : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR); + RWTexture2D rw_output_prev_color_post_alpha : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR); + +#else // #if defined(FFX_INTERNAL) + #if defined FSR2_BIND_SRV_INPUT_COLOR + Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_COLOR); + #endif + #if defined FSR2_BIND_SRV_INPUT_OPAQUE_ONLY + Texture2D r_input_opaque_only : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY); + #endif + #if defined FSR2_BIND_SRV_INPUT_MOTION_VECTORS + Texture2D r_input_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_MOTION_VECTORS); + #endif + #if defined FSR2_BIND_SRV_INPUT_DEPTH + Texture2D r_input_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_DEPTH); + #endif + #if defined FSR2_BIND_SRV_INPUT_EXPOSURE + Texture2D r_input_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_EXPOSURE); + #endif + #if defined FSR2_BIND_SRV_AUTO_EXPOSURE + Texture2D r_auto_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_AUTO_EXPOSURE); + #endif + #if defined FSR2_BIND_SRV_REACTIVE_MASK + Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_REACTIVE_MASK); + #endif + #if defined FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK + Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK); + #endif + #if defined FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH + Texture2D r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH); + #endif + #if defined FSR2_BIND_SRV_DILATED_MOTION_VECTORS + Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_MOTION_VECTORS); + #endif + #if defined FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS + Texture2D r_previous_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS); + #endif + #if defined FSR2_BIND_SRV_DILATED_DEPTH + Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_DEPTH); + #endif + #if defined FSR2_BIND_SRV_INTERNAL_UPSCALED + Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INTERNAL_UPSCALED); + #endif + #if defined FSR2_BIND_SRV_LOCK_STATUS + Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_STATUS); + #endif + #if defined FSR2_BIND_SRV_LOCK_INPUT_LUMA + Texture2D r_lock_input_luma : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_INPUT_LUMA); + #endif + #if defined FSR2_BIND_SRV_NEW_LOCKS + Texture2D r_new_locks : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_NEW_LOCKS); + #endif + #if defined FSR2_BIND_SRV_PREPARED_INPUT_COLOR + Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREPARED_INPUT_COLOR); + #endif + #if defined FSR2_BIND_SRV_LUMA_HISTORY + 
Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LUMA_HISTORY); + #endif + #if defined FSR2_BIND_SRV_RCAS_INPUT + Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RCAS_INPUT); + #endif + #if defined FSR2_BIND_SRV_LANCZOS_LUT + Texture2D r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LANCZOS_LUT); + #endif + #if defined FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS + Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS); + #endif + #if defined FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT + Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT); + #endif + #if defined FSR2_BIND_SRV_DILATED_REACTIVE_MASKS + Texture2D r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS); + #endif + + #if defined FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR + Texture2D r_input_prev_color_pre_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR); + #endif + #if defined FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR + Texture2D r_input_prev_color_post_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR); + #endif + + // UAV declarations + #if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH + RWTexture2D rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH); + #endif + #if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS + RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_MOTION_VECTORS); + #endif + #if defined FSR2_BIND_UAV_DILATED_DEPTH + RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_DEPTH); + #endif + #if defined FSR2_BIND_UAV_INTERNAL_UPSCALED + RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_INTERNAL_UPSCALED); + #endif + #if defined FSR2_BIND_UAV_LOCK_STATUS + RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_STATUS); + #endif + #if defined FSR2_BIND_UAV_LOCK_INPUT_LUMA + RWTexture2D rw_lock_input_luma : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_INPUT_LUMA); + #endif + #if defined FSR2_BIND_UAV_NEW_LOCKS + RWTexture2D rw_new_locks : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_NEW_LOCKS); + #endif + #if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR + RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREPARED_INPUT_COLOR); + #endif + #if defined FSR2_BIND_UAV_LUMA_HISTORY + RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LUMA_HISTORY); + #endif + #if defined FSR2_BIND_UAV_UPSCALED_OUTPUT + RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_UPSCALED_OUTPUT); + #endif + #if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE + globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE); + #endif + #if defined FSR2_BIND_UAV_EXPOSURE_MIP_5 + globallycoherent RWTexture2D rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_5); + #endif + #if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS + RWTexture2D rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS); + #endif + #if defined FSR2_BIND_UAV_EXPOSURE + RWTexture2D rw_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE); + #endif + #if defined FSR2_BIND_UAV_AUTO_EXPOSURE + RWTexture2D rw_auto_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTO_EXPOSURE); + #endif + #if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC + globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC); + 
#endif + + #if defined FSR2_BIND_UAV_AUTOREACTIVE + RWTexture2D rw_output_autoreactive : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTOREACTIVE); + #endif + #if defined FSR2_BIND_UAV_AUTOCOMPOSITION + RWTexture2D rw_output_autocomposition : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTOCOMPOSITION); + #endif + #if defined FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR + RWTexture2D rw_output_prev_color_pre_alpha : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR); + #endif + #if defined FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR + RWTexture2D rw_output_prev_color_post_alpha : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR); + #endif +#endif // #if defined(FFX_INTERNAL) + +#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) || defined(FFX_INTERNAL) +FfxFloat32 LoadMipLuma(FfxUInt32x2 iPxPos, FfxUInt32 mipLevel) +{ + return r_imgMips.mips[mipLevel][iPxPos]; +} +#endif + +#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) || defined(FFX_INTERNAL) +FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel) +{ + return r_imgMips.SampleLevel(s_LinearClamp, fUV, mipLevel); +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_DEPTH) || defined(FFX_INTERNAL) +FfxFloat32 LoadInputDepth(FfxUInt32x2 iPxPos) +{ + return r_input_depth[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_DEPTH) || defined(FFX_INTERNAL) +FfxFloat32 SampleInputDepth(FfxFloat32x2 fUV) +{ + return r_input_depth.SampleLevel(s_LinearClamp, fUV, 0).x; +} +#endif + +#if defined(FSR2_BIND_SRV_REACTIVE_MASK) || defined(FFX_INTERNAL) +FfxFloat32 LoadReactiveMask(FfxUInt32x2 iPxPos) +{ + return r_reactive_mask[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) || defined(FFX_INTERNAL) +FfxFloat32 LoadTransparencyAndCompositionMask(FfxUInt32x2 iPxPos) +{ + return r_transparency_and_composition_mask[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL) +FfxFloat32x3 LoadInputColor(FfxUInt32x2 iPxPos) +{ + return r_input_color_jittered[iPxPos].rgb; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL) +FfxFloat32x3 SampleInputColor(FfxFloat32x2 fUV) +{ + return r_input_color_jittered.SampleLevel(s_LinearClamp, fUV, 0).rgb; +} +#endif + +#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL) +FfxFloat32x3 LoadPreparedInputColor(FfxUInt32x2 iPxPos) +{ + return r_prepared_input_color[iPxPos].xyz; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS) || defined(FFX_INTERNAL) +FfxFloat32x2 LoadInputMotionVector(FfxUInt32x2 iPxDilatedMotionVectorPos) +{ + FfxFloat32x2 fSrcMotionVector = r_input_motion_vectors[iPxDilatedMotionVectorPos].xy; + + FfxFloat32x2 fUvMotionVector = fSrcMotionVector * MotionVectorScale(); + +#if FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS + fUvMotionVector -= MotionVectorJitterCancellation(); +#endif + + return fUvMotionVector; +} +#endif + +#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL) +FfxFloat32x4 LoadHistory(FfxUInt32x2 iPxHistory) +{ + return r_internal_upscaled_color[iPxHistory]; +} +#endif + +#if defined(FSR2_BIND_UAV_LUMA_HISTORY) || defined(FFX_INTERNAL) +void StoreLumaHistory(FfxUInt32x2 iPxPos, FfxFloat32x4 fLumaHistory) +{ + rw_luma_history[iPxPos] = fLumaHistory; +} +#endif + +#if defined(FSR2_BIND_SRV_LUMA_HISTORY) || defined(FFX_INTERNAL) +FfxFloat32x4 SampleLumaHistory(FfxFloat32x2 fUV) +{ + return r_luma_history.SampleLevel(s_LinearClamp, fUV, 0); +} +#endif + +#if defined(FFX_INTERNAL) +FfxFloat32x4 SampleDebug(FfxFloat32x2 fUV) +{ + return r_debug_out.SampleLevel(s_LinearClamp, fUV, 0).w; 
+} +#endif + +#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL) +void StoreReprojectedHistory(FfxUInt32x2 iPxHistory, FfxFloat32x4 fHistory) +{ + rw_internal_upscaled_color[iPxHistory] = fHistory; +} +#endif + +#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL) +void StoreInternalColorAndWeight(FfxUInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight) +{ + rw_internal_upscaled_color[iPxPos] = fColorAndWeight; +} +#endif + +#if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT) || defined(FFX_INTERNAL) +void StoreUpscaledOutput(FfxUInt32x2 iPxPos, FfxFloat32x3 fColor) +{ + rw_upscaled_output[iPxPos] = FfxFloat32x4(fColor, 1.f); +} +#endif + +//LOCK_LIFETIME_REMAINING == 0 +//Should make LockInitialLifetime() return a const 1.0f later +#if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL) +FfxFloat32x2 LoadLockStatus(FfxUInt32x2 iPxPos) +{ + return r_lock_status[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_LOCK_STATUS) || defined(FFX_INTERNAL) +void StoreLockStatus(FfxUInt32x2 iPxPos, FfxFloat32x2 fLockStatus) +{ + rw_lock_status[iPxPos] = fLockStatus; +} +#endif + +#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA) || defined(FFX_INTERNAL) +FfxFloat32 LoadLockInputLuma(FfxUInt32x2 iPxPos) +{ + return r_lock_input_luma[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA) || defined(FFX_INTERNAL) +void StoreLockInputLuma(FfxUInt32x2 iPxPos, FfxFloat32 fLuma) +{ + rw_lock_input_luma[iPxPos] = fLuma; +} +#endif + +#if defined(FSR2_BIND_SRV_NEW_LOCKS) || defined(FFX_INTERNAL) +FfxFloat32 LoadNewLocks(FfxUInt32x2 iPxPos) +{ + return r_new_locks[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_NEW_LOCKS) || defined(FFX_INTERNAL) +FfxFloat32 LoadRwNewLocks(FfxUInt32x2 iPxPos) +{ + return rw_new_locks[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_NEW_LOCKS) || defined(FFX_INTERNAL) +void StoreNewLocks(FfxUInt32x2 iPxPos, FfxFloat32 newLock) +{ + rw_new_locks[iPxPos] = newLock; +} +#endif + +#if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL) +void StorePreparedInputColor(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped) +{ + rw_prepared_input_color[iPxPos] = fTonemapped; +} +#endif + +#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL) +FfxFloat32 SampleDepthClip(FfxFloat32x2 fUV) +{ + return r_prepared_input_color.SampleLevel(s_LinearClamp, fUV, 0).w; +} +#endif + +#if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL) +FfxFloat32x2 SampleLockStatus(FfxFloat32x2 fUV) +{ + FfxFloat32x2 fLockStatus = r_lock_status.SampleLevel(s_LinearClamp, fUV, 0); + return fLockStatus; +} +#endif + +#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL) +FfxFloat32 LoadReconstructedPrevDepth(FfxUInt32x2 iPxPos) +{ + return asfloat(r_reconstructed_previous_nearest_depth[iPxPos]); +} +#endif + +#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL) +void StoreReconstructedDepth(FfxUInt32x2 iPxSample, FfxFloat32 fDepth) +{ + FfxUInt32 uDepth = asuint(fDepth); + + #if FFX_FSR2_OPTION_INVERTED_DEPTH + InterlockedMax(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth); + #else + InterlockedMin(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth); // min for standard, max for inverted depth + #endif +} +#endif + +#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL) +void SetReconstructedDepth(FfxUInt32x2 iPxSample, const FfxUInt32 uValue) +{ + rw_reconstructed_previous_nearest_depth[iPxSample] = 
uValue; +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_DEPTH) || defined(FFX_INTERNAL) +void StoreDilatedDepth(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth) +{ + rw_dilatedDepth[iPxPos] = fDepth; +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL) +void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector) +{ + rw_dilated_motion_vectors[iPxPos] = fMotionVector; +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL) +FfxFloat32x2 LoadDilatedMotionVector(FfxUInt32x2 iPxInput) +{ + return r_dilated_motion_vectors[iPxInput].xy; +} +#endif + +#if defined(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL) +FfxFloat32x2 LoadPreviousDilatedMotionVector(FfxUInt32x2 iPxInput) +{ + return r_previous_dilated_motion_vectors[iPxInput].xy; +} + +FfxFloat32x2 SamplePreviousDilatedMotionVector(FfxFloat32x2 uv) +{ + return r_previous_dilated_motion_vectors.SampleLevel(s_LinearClamp, uv, 0).xy; +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_DEPTH) || defined(FFX_INTERNAL) +FfxFloat32 LoadDilatedDepth(FfxUInt32x2 iPxInput) +{ + return r_dilatedDepth[iPxInput]; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE) || defined(FFX_INTERNAL) +FfxFloat32 Exposure() +{ + FfxFloat32 exposure = r_input_exposure[FfxUInt32x2(0, 0)].x; + + if (exposure == 0.0f) { + exposure = 1.0f; + } + + return exposure; +} +#endif + +#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE) || defined(FFX_INTERNAL) +FfxFloat32 AutoExposure() +{ + FfxFloat32 exposure = r_auto_exposure[FfxUInt32x2(0, 0)].x; + + if (exposure == 0.0f) { + exposure = 1.0f; + } + + return exposure; +} +#endif + +FfxFloat32 SampleLanczos2Weight(FfxFloat32 x) +{ +#if defined(FSR2_BIND_SRV_LANCZOS_LUT) || defined(FFX_INTERNAL) + return r_lanczos_lut.SampleLevel(s_LinearClamp, FfxFloat32x2(x / 2, 0.5f), 0); +#else + return 0.f; +#endif +} + +#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) || defined(FFX_INTERNAL) +FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv) +{ + // Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range. 
+ return FfxFloat32(2.0) * r_upsample_maximum_bias_lut.SampleLevel(s_LinearClamp, abs(uv) * 2.0, 0); +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL) +FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV) +{ + return r_dilated_reactive_masks.SampleLevel(s_LinearClamp, fUV, 0); +} +#endif + +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL) +FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos) +{ + return r_dilated_reactive_masks[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL) +void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks) +{ + rw_dilated_reactive_masks[iPxPos] = fDilatedReactiveMasks; +} +#endif + +#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) || defined(FFX_INTERNAL) +FfxFloat32x3 LoadOpaqueOnly(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return r_input_opaque_only[iPxPos].xyz; +} +#endif + +#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) || defined(FFX_INTERNAL) +FfxFloat32x3 LoadPrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return r_input_prev_color_pre_alpha[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) || defined(FFX_INTERNAL) +FfxFloat32x3 LoadPrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +{ + return r_input_prev_color_post_alpha[iPxPos]; +} +#endif + +#if defined(FSR2_BIND_UAV_AUTOREACTIVE) || defined(FFX_INTERNAL) +#if defined(FSR2_BIND_UAV_AUTOCOMPOSITION) || defined(FFX_INTERNAL) +void StoreAutoReactive(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F2 fReactive) +{ + rw_output_autoreactive[iPxPos] = fReactive.x; + + rw_output_autocomposition[iPxPos] = fReactive.y; +} +#endif +#endif + +#if defined(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR) || defined(FFX_INTERNAL) +void StorePrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color) +{ + rw_output_prev_color_pre_alpha[iPxPos] = color; + +} +#endif + +#if defined(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR) || defined(FFX_INTERNAL) +void StorePrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color) +{ + rw_output_prev_color_post_alpha[iPxPos] = color; +} +#endif + +#endif // #if defined(FFX_GPU) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_common.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_common.h new file mode 100644 index 00000000000..0c72aa84943 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_common.h @@ -0,0 +1,565 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#if !defined(FFX_FSR2_COMMON_H) +#define FFX_FSR2_COMMON_H + +#if defined(FFX_CPU) || defined(FFX_GPU) +//Locks +#define LOCK_LIFETIME_REMAINING 0 +#define LOCK_TEMPORAL_LUMA 1 +#endif // #if defined(FFX_CPU) || defined(FFX_GPU) + +#if defined(FFX_GPU) +FFX_STATIC const FfxFloat32 FSR2_FP16_MIN = 6.10e-05f; +FFX_STATIC const FfxFloat32 FSR2_FP16_MAX = 65504.0f; +FFX_STATIC const FfxFloat32 FSR2_EPSILON = 1e-03f; +FFX_STATIC const FfxFloat32 FSR2_TONEMAP_EPSILON = 1.0f / FSR2_FP16_MAX; +FFX_STATIC const FfxFloat32 FSR2_FLT_MAX = 3.402823466e+38f; +FFX_STATIC const FfxFloat32 FSR2_FLT_MIN = 1.175494351e-38f; + +// treat vector truncation warnings as errors +#pragma warning(error: 3206) + +// suppress warnings +#pragma warning(disable: 3205) // conversion from larger type to smaller +#pragma warning(disable: 3571) // in ffxPow(f, e), f could be negative + +// Reconstructed depth usage +FFX_STATIC const FfxFloat32 fReconstructedDepthBilinearWeightThreshold = 0.01f; + +// Accumulation +FFX_STATIC const FfxFloat32 fUpsampleLanczosWeightScale = 1.0f / 12.0f; +FFX_STATIC const FfxFloat32 fMaxAccumulationLanczosWeight = 1.0f; +FFX_STATIC const FfxFloat32 fAverageLanczosWeightPerFrame = 0.74f * fUpsampleLanczosWeightScale; // Average lanczos weight for jitter accumulated samples +FFX_STATIC const FfxFloat32 fAccumulationMaxOnMotion = 3.0f * fUpsampleLanczosWeightScale; + +// Auto exposure +FFX_STATIC const FfxFloat32 resetAutoExposureAverageSmoothing = 1e8f; + +struct AccumulationPassCommonParams +{ + FfxInt32x2 iPxHrPos; + FfxFloat32x2 fHrUv; + FfxFloat32x2 fLrUv_HwSampler; + FfxFloat32x2 fMotionVector; + FfxFloat32x2 fReprojectedHrUv; + FfxFloat32 fHrVelocity; + FfxFloat32 fDepthClipFactor; + FfxFloat32 fDilatedReactiveFactor; + FfxFloat32 fAccumulationMask; + + FfxBoolean bIsResetFrame; + FfxBoolean bIsExistingSample; + FfxBoolean bIsNewSample; +}; + +struct LockState +{ + FfxBoolean NewLock; //Set for both unique new and re-locked new + FfxBoolean WasLockedPrevFrame; //Set to identify if the pixel was already locked (relock) +}; + +void InitializeNewLockSample(FFX_PARAMETER_OUT FfxFloat32x2 fLockStatus) +{ + fLockStatus = FfxFloat32x2(0, 0); +} + +#if FFX_HALF +void InitializeNewLockSample(FFX_PARAMETER_OUT FFX_MIN16_F2 fLockStatus) +{ + fLockStatus = FFX_MIN16_F2(0, 0); +} +#endif + + +void KillLock(FFX_PARAMETER_INOUT FfxFloat32x2 fLockStatus) +{ + fLockStatus[LOCK_LIFETIME_REMAINING] = 0; +} + +#if FFX_HALF +void KillLock(FFX_PARAMETER_INOUT FFX_MIN16_F2 fLockStatus) +{ + fLockStatus[LOCK_LIFETIME_REMAINING] = FFX_MIN16_F(0); +} +#endif + +struct RectificationBox +{ + FfxFloat32x3 boxCenter; + FfxFloat32x3 boxVec; + FfxFloat32x3 aabbMin; + FfxFloat32x3 aabbMax; + FfxFloat32 fBoxCenterWeight; +}; +#if FFX_HALF +struct RectificationBoxMin16 +{ + FFX_MIN16_F3 boxCenter; + FFX_MIN16_F3 boxVec; + FFX_MIN16_F3 aabbMin; + FFX_MIN16_F3 aabbMax; + FFX_MIN16_F fBoxCenterWeight; +}; +#endif + +void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBox rectificationBox) +{ + rectificationBox.fBoxCenterWeight = FfxFloat32(0); + + rectificationBox.boxCenter = FfxFloat32x3(0, 0, 0); + rectificationBox.boxVec = FfxFloat32x3(0, 0, 0); + rectificationBox.aabbMin = FfxFloat32x3(FSR2_FLT_MAX, FSR2_FLT_MAX, FSR2_FLT_MAX); + 
rectificationBox.aabbMax = -FfxFloat32x3(FSR2_FLT_MAX, FSR2_FLT_MAX, FSR2_FLT_MAX); +} +#if FFX_HALF +void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox) +{ + rectificationBox.fBoxCenterWeight = FFX_MIN16_F(0); + + rectificationBox.boxCenter = FFX_MIN16_F3(0, 0, 0); + rectificationBox.boxVec = FFX_MIN16_F3(0, 0, 0); + rectificationBox.aabbMin = FFX_MIN16_F3(FSR2_FP16_MAX, FSR2_FP16_MAX, FSR2_FP16_MAX); + rectificationBox.aabbMax = -FFX_MIN16_F3(FSR2_FP16_MAX, FSR2_FP16_MAX, FSR2_FP16_MAX); +} +#endif + +void RectificationBoxAddInitialSample(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 colorSample, const FfxFloat32 fSampleWeight) +{ + rectificationBox.aabbMin = colorSample; + rectificationBox.aabbMax = colorSample; + + FfxFloat32x3 weightedSample = colorSample * fSampleWeight; + rectificationBox.boxCenter = weightedSample; + rectificationBox.boxVec = colorSample * weightedSample; + rectificationBox.fBoxCenterWeight = fSampleWeight; +} + +void RectificationBoxAddSample(FfxBoolean bInitialSample, FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 colorSample, const FfxFloat32 fSampleWeight) +{ + if (bInitialSample) { + RectificationBoxAddInitialSample(rectificationBox, colorSample, fSampleWeight); + } else { + rectificationBox.aabbMin = ffxMin(rectificationBox.aabbMin, colorSample); + rectificationBox.aabbMax = ffxMax(rectificationBox.aabbMax, colorSample); + + FfxFloat32x3 weightedSample = colorSample * fSampleWeight; + rectificationBox.boxCenter += weightedSample; + rectificationBox.boxVec += colorSample * weightedSample; + rectificationBox.fBoxCenterWeight += fSampleWeight; + } +} +#if FFX_HALF +void RectificationBoxAddInitialSample(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 colorSample, const FFX_MIN16_F fSampleWeight) +{ + rectificationBox.aabbMin = colorSample; + rectificationBox.aabbMax = colorSample; + + FFX_MIN16_F3 weightedSample = colorSample * fSampleWeight; + rectificationBox.boxCenter = weightedSample; + rectificationBox.boxVec = colorSample * weightedSample; + rectificationBox.fBoxCenterWeight = fSampleWeight; +} + +void RectificationBoxAddSample(FfxBoolean bInitialSample, FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 colorSample, const FFX_MIN16_F fSampleWeight) +{ + if (bInitialSample) { + RectificationBoxAddInitialSample(rectificationBox, colorSample, fSampleWeight); + } else { + rectificationBox.aabbMin = ffxMin(rectificationBox.aabbMin, colorSample); + rectificationBox.aabbMax = ffxMax(rectificationBox.aabbMax, colorSample); + + FFX_MIN16_F3 weightedSample = colorSample * fSampleWeight; + rectificationBox.boxCenter += weightedSample; + rectificationBox.boxVec += colorSample * weightedSample; + rectificationBox.fBoxCenterWeight += fSampleWeight; + } +} +#endif + +void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBox rectificationBox) +{ + rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FfxFloat32(FSR2_EPSILON) ? 
rectificationBox.fBoxCenterWeight : FfxFloat32(1.f)); + rectificationBox.boxCenter /= rectificationBox.fBoxCenterWeight; + rectificationBox.boxVec /= rectificationBox.fBoxCenterWeight; + FfxFloat32x3 stdDev = sqrt(abs(rectificationBox.boxVec - rectificationBox.boxCenter * rectificationBox.boxCenter)); + rectificationBox.boxVec = stdDev; +} +#if FFX_HALF +void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox) +{ + rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FFX_MIN16_F(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : FFX_MIN16_F(1.f)); + rectificationBox.boxCenter /= rectificationBox.fBoxCenterWeight; + rectificationBox.boxVec /= rectificationBox.fBoxCenterWeight; + FFX_MIN16_F3 stdDev = sqrt(abs(rectificationBox.boxVec - rectificationBox.boxCenter * rectificationBox.boxCenter)); + rectificationBox.boxVec = stdDev; +} +#endif + +FfxFloat32x3 SafeRcp3(FfxFloat32x3 v) +{ + return (all(FFX_NOT_EQUAL(v, FfxFloat32x3(0, 0, 0)))) ? (FfxFloat32x3(1, 1, 1) / v) : FfxFloat32x3(0, 0, 0); +} +#if FFX_HALF +FFX_MIN16_F3 SafeRcp3(FFX_MIN16_F3 v) +{ + return (all(FFX_NOT_EQUAL(v, FFX_MIN16_F3(0, 0, 0)))) ? (FFX_MIN16_F3(1, 1, 1) / v) : FFX_MIN16_F3(0, 0, 0); +} +#endif + +FfxFloat32 MinDividedByMax(const FfxFloat32 v0, const FfxFloat32 v1) +{ + const FfxFloat32 m = ffxMax(v0, v1); + return m != 0 ? ffxMin(v0, v1) / m : 0; +} + +#if FFX_HALF +FFX_MIN16_F MinDividedByMax(const FFX_MIN16_F v0, const FFX_MIN16_F v1) +{ + const FFX_MIN16_F m = ffxMax(v0, v1); + return m != FFX_MIN16_F(0) ? ffxMin(v0, v1) / m : FFX_MIN16_F(0); +} +#endif + +FfxFloat32x3 YCoCgToRGB(FfxFloat32x3 fYCoCg) +{ + FfxFloat32x3 fRgb; + + fRgb = FfxFloat32x3( + fYCoCg.x + fYCoCg.y - fYCoCg.z, + fYCoCg.x + fYCoCg.z, + fYCoCg.x - fYCoCg.y - fYCoCg.z); + + return fRgb; +} +#if FFX_HALF +FFX_MIN16_F3 YCoCgToRGB(FFX_MIN16_F3 fYCoCg) +{ + FFX_MIN16_F3 fRgb; + + fRgb = FFX_MIN16_F3( + fYCoCg.x + fYCoCg.y - fYCoCg.z, + fYCoCg.x + fYCoCg.z, + fYCoCg.x - fYCoCg.y - fYCoCg.z); + + return fRgb; +} +#endif + +FfxFloat32x3 RGBToYCoCg(FfxFloat32x3 fRgb) +{ + FfxFloat32x3 fYCoCg; + + fYCoCg = FfxFloat32x3( + 0.25f * fRgb.r + 0.5f * fRgb.g + 0.25f * fRgb.b, + 0.5f * fRgb.r - 0.5f * fRgb.b, + -0.25f * fRgb.r + 0.5f * fRgb.g - 0.25f * fRgb.b); + + return fYCoCg; +} +#if FFX_HALF +FFX_MIN16_F3 RGBToYCoCg(FFX_MIN16_F3 fRgb) +{ + FFX_MIN16_F3 fYCoCg; + + fYCoCg = FFX_MIN16_F3( + 0.25 * fRgb.r + 0.5 * fRgb.g + 0.25 * fRgb.b, + 0.5 * fRgb.r - 0.5 * fRgb.b, + -0.25 * fRgb.r + 0.5 * fRgb.g - 0.25 * fRgb.b); + + return fYCoCg; +} +#endif + +FfxFloat32 RGBToLuma(FfxFloat32x3 fLinearRgb) +{ + return dot(fLinearRgb, FfxFloat32x3(0.2126f, 0.7152f, 0.0722f)); +} +#if FFX_HALF +FFX_MIN16_F RGBToLuma(FFX_MIN16_F3 fLinearRgb) +{ + return dot(fLinearRgb, FFX_MIN16_F3(0.2126f, 0.7152f, 0.0722f)); +} +#endif + +FfxFloat32 RGBToPerceivedLuma(FfxFloat32x3 fLinearRgb) +{ + FfxFloat32 fLuminance = RGBToLuma(fLinearRgb); + + FfxFloat32 fPercievedLuminance = 0; + if (fLuminance <= 216.0f / 24389.0f) { + fPercievedLuminance = fLuminance * (24389.0f / 27.0f); + } + else { + fPercievedLuminance = ffxPow(fLuminance, 1.0f / 3.0f) * 116.0f - 16.0f; + } + + return fPercievedLuminance * 0.01f; +} +#if FFX_HALF +FFX_MIN16_F RGBToPerceivedLuma(FFX_MIN16_F3 fLinearRgb) +{ + FFX_MIN16_F fLuminance = RGBToLuma(fLinearRgb); + + FFX_MIN16_F fPercievedLuminance = FFX_MIN16_F(0); + if (fLuminance <= FFX_MIN16_F(216.0f / 24389.0f)) { + fPercievedLuminance = fLuminance * FFX_MIN16_F(24389.0f / 27.0f); + } + else { 
+ fPercievedLuminance = ffxPow(fLuminance, FFX_MIN16_F(1.0f / 3.0f)) * FFX_MIN16_F(116.0f) - FFX_MIN16_F(16.0f); + } + + return fPercievedLuminance * FFX_MIN16_F(0.01f); +} +#endif + +FfxFloat32x3 Tonemap(FfxFloat32x3 fRgb) +{ + return fRgb / (ffxMax(ffxMax(0.f, fRgb.r), ffxMax(fRgb.g, fRgb.b)) + 1.f).xxx; +} + +FfxFloat32x3 InverseTonemap(FfxFloat32x3 fRgb) +{ + return fRgb / ffxMax(FSR2_TONEMAP_EPSILON, 1.f - ffxMax(fRgb.r, ffxMax(fRgb.g, fRgb.b))).xxx; +} + +#if FFX_HALF +FFX_MIN16_F3 Tonemap(FFX_MIN16_F3 fRgb) +{ + return fRgb / (ffxMax(ffxMax(FFX_MIN16_F(0.f), fRgb.r), ffxMax(fRgb.g, fRgb.b)) + FFX_MIN16_F(1.f)).xxx; +} + +FFX_MIN16_F3 InverseTonemap(FFX_MIN16_F3 fRgb) +{ + return fRgb / ffxMax(FFX_MIN16_F(FSR2_TONEMAP_EPSILON), FFX_MIN16_F(1.f) - ffxMax(fRgb.r, ffxMax(fRgb.g, fRgb.b))).xxx; +} +#endif + +FfxInt32x2 ClampLoad(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize) +{ + FfxInt32x2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < 0) ? ffxMax(result.x, 0) : result.x; + result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x; + result.y = (iPxOffset.y < 0) ? ffxMax(result.y, 0) : result.y; + result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - 1) : result.y; + return result; + + // return ffxMed3(iPxSample + iPxOffset, FfxInt32x2(0, 0), iTextureSize - FfxInt32x2(1, 1)); +} +#if FFX_HALF +FFX_MIN16_I2 ClampLoad(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize) +{ + FFX_MIN16_I2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < 0) ? ffxMax(result.x, FFX_MIN16_I(0)) : result.x; + result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - FFX_MIN16_I(1)) : result.x; + result.y = (iPxOffset.y < 0) ? ffxMax(result.y, FFX_MIN16_I(0)) : result.y; + result.y = (iPxOffset.y > 0) ? 
ffxMin(result.y, iTextureSize.y - FFX_MIN16_I(1)) : result.y; + return result; + + // return ffxMed3Half(iPxSample + iPxOffset, FFX_MIN16_I2(0, 0), iTextureSize - FFX_MIN16_I2(1, 1)); +} +#endif + +FfxFloat32x2 ClampUv(FfxFloat32x2 fUv, FfxInt32x2 iTextureSize, FfxInt32x2 iResourceSize) +{ + const FfxFloat32x2 fSampleLocation = fUv * iTextureSize; + const FfxFloat32x2 fClampedLocation = ffxMax(FfxFloat32x2(0.5f, 0.5f), ffxMin(fSampleLocation, FfxFloat32x2(iTextureSize) - FfxFloat32x2(0.5f, 0.5f))); + const FfxFloat32x2 fClampedUv = fClampedLocation / FfxFloat32x2(iResourceSize); + + return fClampedUv; +} + +FfxBoolean IsOnScreen(FfxInt32x2 pos, FfxInt32x2 size) +{ + return all(FFX_LESS_THAN(FfxUInt32x2(pos), FfxUInt32x2(size))); +} +#if FFX_HALF +FfxBoolean IsOnScreen(FFX_MIN16_I2 pos, FFX_MIN16_I2 size) +{ + return all(FFX_LESS_THAN(FFX_MIN16_U2(pos), FFX_MIN16_U2(size))); +} +#endif + +FfxFloat32 ComputeAutoExposureFromLavg(FfxFloat32 Lavg) +{ + Lavg = exp(Lavg); + + const FfxFloat32 S = 100.0f; //ISO arithmetic speed + const FfxFloat32 K = 12.5f; + FfxFloat32 ExposureISO100 = log2((Lavg * S) / K); + + const FfxFloat32 q = 0.65f; + FfxFloat32 Lmax = (78.0f / (q * S)) * ffxPow(2.0f, ExposureISO100); + + return 1 / Lmax; +} +#if FFX_HALF +FFX_MIN16_F ComputeAutoExposureFromLavg(FFX_MIN16_F Lavg) +{ + Lavg = exp(Lavg); + + const FFX_MIN16_F S = FFX_MIN16_F(100.0f); //ISO arithmetic speed + const FFX_MIN16_F K = FFX_MIN16_F(12.5f); + const FFX_MIN16_F ExposureISO100 = log2((Lavg * S) / K); + + const FFX_MIN16_F q = FFX_MIN16_F(0.65f); + const FFX_MIN16_F Lmax = (FFX_MIN16_F(78.0f) / (q * S)) * ffxPow(FFX_MIN16_F(2.0f), ExposureISO100); + + return FFX_MIN16_F(1) / Lmax; +} +#endif + +FfxInt32x2 ComputeHrPosFromLrPos(FfxInt32x2 iPxLrPos) +{ + FfxFloat32x2 fSrcJitteredPos = FfxFloat32x2(iPxLrPos) + 0.5f - Jitter(); + FfxFloat32x2 fLrPosInHr = (fSrcJitteredPos / RenderSize()) * DisplaySize(); + FfxInt32x2 iPxHrPos = FfxInt32x2(floor(fLrPosInHr)); + return iPxHrPos; +} +#if FFX_HALF +FFX_MIN16_I2 ComputeHrPosFromLrPos(FFX_MIN16_I2 iPxLrPos) +{ + FFX_MIN16_F2 fSrcJitteredPos = FFX_MIN16_F2(iPxLrPos) + FFX_MIN16_F(0.5f) - FFX_MIN16_F2(Jitter()); + FFX_MIN16_F2 fLrPosInHr = (fSrcJitteredPos / FFX_MIN16_F2(RenderSize())) * FFX_MIN16_F2(DisplaySize()); + FFX_MIN16_I2 iPxHrPos = FFX_MIN16_I2(floor(fLrPosInHr)); + return iPxHrPos; +} +#endif + +FfxFloat32x2 ComputeNdc(FfxFloat32x2 fPxPos, FfxInt32x2 iSize) +{ + return fPxPos / FfxFloat32x2(iSize) * FfxFloat32x2(2.0f, -2.0f) + FfxFloat32x2(-1.0f, 1.0f); +} + +FfxFloat32 GetViewSpaceDepth(FfxFloat32 fDeviceDepth) +{ + const FfxFloat32x4 fDeviceToViewDepth = DeviceToViewSpaceTransformFactors(); + + // fDeviceToViewDepth details found in ffx_fsr2.cpp + return (fDeviceToViewDepth[1] / (fDeviceDepth - fDeviceToViewDepth[0])); +} + +FfxFloat32 GetViewSpaceDepthInMeters(FfxFloat32 fDeviceDepth) +{ + return GetViewSpaceDepth(fDeviceDepth) * ViewSpaceToMetersFactor(); +} + +FfxFloat32x3 GetViewSpacePosition(FfxInt32x2 iViewportPos, FfxInt32x2 iViewportSize, FfxFloat32 fDeviceDepth) +{ + const FfxFloat32x4 fDeviceToViewDepth = DeviceToViewSpaceTransformFactors(); + + const FfxFloat32 Z = GetViewSpaceDepth(fDeviceDepth); + + const FfxFloat32x2 fNdcPos = ComputeNdc(iViewportPos, iViewportSize); + const FfxFloat32 X = fDeviceToViewDepth[2] * fNdcPos.x * Z; + const FfxFloat32 Y = fDeviceToViewDepth[3] * fNdcPos.y * Z; + + return FfxFloat32x3(X, Y, Z); +} + +FfxFloat32x3 GetViewSpacePositionInMeters(FfxInt32x2 iViewportPos, FfxInt32x2 iViewportSize, FfxFloat32 
fDeviceDepth) +{ + return GetViewSpacePosition(iViewportPos, iViewportSize, fDeviceDepth) * ViewSpaceToMetersFactor(); +} + +FfxFloat32 GetMaxDistanceInMeters() +{ +#if FFX_FSR2_OPTION_INVERTED_DEPTH + return GetViewSpaceDepth(0.0f) * ViewSpaceToMetersFactor(); +#else + return GetViewSpaceDepth(1.0f) * ViewSpaceToMetersFactor(); +#endif +} + +FfxFloat32x3 PrepareRgb(FfxFloat32x3 fRgb, FfxFloat32 fExposure, FfxFloat32 fPreExposure) +{ + fRgb /= fPreExposure; + fRgb *= fExposure; + + fRgb = clamp(fRgb, 0.0f, FSR2_FP16_MAX); + + return fRgb; +} + +FfxFloat32x3 UnprepareRgb(FfxFloat32x3 fRgb, FfxFloat32 fExposure) +{ + fRgb /= fExposure; + fRgb *= PreExposure(); + + return fRgb; +} + + +struct BilinearSamplingData +{ + FfxInt32x2 iOffsets[4]; + FfxFloat32 fWeights[4]; + FfxInt32x2 iBasePos; +}; + +BilinearSamplingData GetBilinearSamplingData(FfxFloat32x2 fUv, FfxInt32x2 iSize) +{ + BilinearSamplingData data; + + FfxFloat32x2 fPxSample = (fUv * iSize) - FfxFloat32x2(0.5f, 0.5f); + data.iBasePos = FfxInt32x2(floor(fPxSample)); + FfxFloat32x2 fPxFrac = ffxFract(fPxSample); + + data.iOffsets[0] = FfxInt32x2(0, 0); + data.iOffsets[1] = FfxInt32x2(1, 0); + data.iOffsets[2] = FfxInt32x2(0, 1); + data.iOffsets[3] = FfxInt32x2(1, 1); + + data.fWeights[0] = (1 - fPxFrac.x) * (1 - fPxFrac.y); + data.fWeights[1] = (fPxFrac.x) * (1 - fPxFrac.y); + data.fWeights[2] = (1 - fPxFrac.x) * (fPxFrac.y); + data.fWeights[3] = (fPxFrac.x) * (fPxFrac.y); + + return data; +} + +struct PlaneData +{ + FfxFloat32x3 fNormal; + FfxFloat32 fDistanceFromOrigin; +}; + +PlaneData GetPlaneFromPoints(FfxFloat32x3 fP0, FfxFloat32x3 fP1, FfxFloat32x3 fP2) +{ + PlaneData plane; + + FfxFloat32x3 v0 = fP0 - fP1; + FfxFloat32x3 v1 = fP0 - fP2; + plane.fNormal = normalize(cross(v0, v1)); + plane.fDistanceFromOrigin = -dot(fP0, plane.fNormal); + + return plane; +} + +FfxFloat32 PointToPlaneDistance(PlaneData plane, FfxFloat32x3 fPoint) +{ + return abs(dot(plane.fNormal, fPoint) + plane.fDistanceFromOrigin); +} + +#endif // #if defined(FFX_GPU) + +#endif //!defined(FFX_FSR2_COMMON_H) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid.h new file mode 100644 index 00000000000..c63f1820e08 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid.h @@ -0,0 +1,189 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +FFX_GROUPSHARED FfxUInt32 spdCounter; + +#ifndef SPD_PACKED_ONLY +FFX_GROUPSHARED FfxFloat32 spdIntermediateR[16][16]; +FFX_GROUPSHARED FfxFloat32 spdIntermediateG[16][16]; +FFX_GROUPSHARED FfxFloat32 spdIntermediateB[16][16]; +FFX_GROUPSHARED FfxFloat32 spdIntermediateA[16][16]; + +FfxFloat32x4 SpdLoadSourceImage(FfxFloat32x2 tex, FfxUInt32 slice) +{ + FfxFloat32x2 fUv = (tex + 0.5f + Jitter()) / RenderSize(); + fUv = ClampUv(fUv, RenderSize(), InputColorResourceDimensions()); + FfxFloat32x3 fRgb = SampleInputColor(fUv); + + fRgb /= PreExposure(); + + //compute log luma + const FfxFloat32 fLogLuma = log(ffxMax(FSR2_EPSILON, RGBToLuma(fRgb))); + + // Make sure out of screen pixels contribute no value to the end result + const FfxFloat32 result = all(FFX_LESS_THAN(tex, RenderSize())) ? fLogLuma : 0.0f; + + return FfxFloat32x4(result, 0, 0, 0); +} + +FfxFloat32x4 SpdLoad(FfxInt32x2 tex, FfxUInt32 slice) +{ + return SPD_LoadMipmap5(tex); +} + +void SpdStore(FfxInt32x2 pix, FfxFloat32x4 outValue, FfxUInt32 index, FfxUInt32 slice) +{ + if (index == LumaMipLevelToUse() || index == 5) + { + SPD_SetMipmap(pix, index, outValue.r); + } + + if (index == MipCount() - 1) { //accumulate on 1x1 level + + if (all(FFX_EQUAL(pix, FfxInt32x2(0, 0)))) + { + FfxFloat32 prev = SPD_LoadExposureBuffer().y; + FfxFloat32 result = outValue.r; + + if (prev < resetAutoExposureAverageSmoothing) // Compare Lavg, so small or negative values + { + FfxFloat32 rate = 1.0f; + result = prev + (result - prev) * (1 - exp(-DeltaTime() * rate)); + } + FfxFloat32x2 spdOutput = FfxFloat32x2(ComputeAutoExposureFromLavg(result), result); + SPD_SetExposureBuffer(spdOutput); + } + } +} + +void SpdIncreaseAtomicCounter(FfxUInt32 slice) +{ + SPD_IncreaseAtomicCounter(spdCounter); +} + +FfxUInt32 SpdGetAtomicCounter() +{ + return spdCounter; +} + +void SpdResetAtomicCounter(FfxUInt32 slice) +{ + SPD_ResetAtomicCounter(); +} + +FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y) +{ + return FfxFloat32x4( + spdIntermediateR[x][y], + spdIntermediateG[x][y], + spdIntermediateB[x][y], + spdIntermediateA[x][y]); +} +void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value) +{ + spdIntermediateR[x][y] = value.x; + spdIntermediateG[x][y] = value.y; + spdIntermediateB[x][y] = value.z; + spdIntermediateA[x][y] = value.w; +} +FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3) +{ + return (v0 + v1 + v2 + v3) * 0.25f; +} +#endif + +// define fetch and store functions Packed +#if FFX_HALF +#error Callback must be implemented + +FFX_GROUPSHARED FfxFloat16x2 spdIntermediateRG[16][16]; +FFX_GROUPSHARED FfxFloat16x2 spdIntermediateBA[16][16]; + +FfxFloat16x4 SpdLoadSourceImageH(FfxFloat32x2 tex, FfxUInt32 slice) +{ + return FfxFloat16x4(imgDst[0][FfxFloat32x3(tex, slice)]); +} +FfxFloat16x4 SpdLoadH(FfxInt32x2 p, FfxUInt32 slice) +{ + return FfxFloat16x4(imgDst6[FfxUInt32x3(p, slice)]); +} +void SpdStoreH(FfxInt32x2 p, FfxFloat16x4 value, FfxUInt32 mip, FfxUInt32 slice) +{ + if (index == LumaMipLevelToUse() || index == 5) + { + imgDst6[FfxUInt32x3(p, slice)] = FfxFloat32x4(value); + return; + } + imgDst[mip + 1][FfxUInt32x3(p, slice)] = FfxFloat32x4(value); +} +void SpdIncreaseAtomicCounter(FfxUInt32 slice) +{ + 
InterlockedAdd(rw_spd_global_atomic[FfxInt16x2(0, 0)].counter[slice], 1, spdCounter); +} +FfxUInt32 SpdGetAtomicCounter() +{ + return spdCounter; +} +void SpdResetAtomicCounter(FfxUInt32 slice) +{ + rw_spd_global_atomic[FfxInt16x2(0, 0)].counter[slice] = 0; +} +FfxFloat16x4 SpdLoadIntermediateH(FfxUInt32 x, FfxUInt32 y) +{ + return FfxFloat16x4( + spdIntermediateRG[x][y].x, + spdIntermediateRG[x][y].y, + spdIntermediateBA[x][y].x, + spdIntermediateBA[x][y].y); +} +void SpdStoreIntermediateH(FfxUInt32 x, FfxUInt32 y, FfxFloat16x4 value) +{ + spdIntermediateRG[x][y] = value.xy; + spdIntermediateBA[x][y] = value.zw; +} +FfxFloat16x4 SpdReduce4H(FfxFloat16x4 v0, FfxFloat16x4 v1, FfxFloat16x4 v2, FfxFloat16x4 v3) +{ + return (v0 + v1 + v2 + v3) * FfxFloat16(0.25); +} +#endif + +#include "ffx_spd.h" + +void ComputeAutoExposure(FfxUInt32x3 WorkGroupId, FfxUInt32 LocalThreadIndex) +{ +#if FFX_HALF + SpdDownsampleH( + FfxUInt32x2(WorkGroupId.xy), + FfxUInt32(LocalThreadIndex), + FfxUInt32(MipCount()), + FfxUInt32(NumWorkGroups()), + FfxUInt32(WorkGroupId.z), + FfxUInt32x2(WorkGroupOffset())); +#else + SpdDownsample( + FfxUInt32x2(WorkGroupId.xy), + FfxUInt32(LocalThreadIndex), + FfxUInt32(MipCount()), + FfxUInt32(NumWorkGroups()), + FfxUInt32(WorkGroupId.z), + FfxUInt32x2(WorkGroupOffset())); +#endif +} \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl new file mode 100644 index 00000000000..088e425452c --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl @@ -0,0 +1,134 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
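+
+// This pass builds the scene log-luminance mip pyramid with FFX SPD and, at the
+// final 1x1 level, smooths the average log-luminance over time and converts it to
+// an auto-exposure value (ComputeAutoExposureFromLavg), which is written to the
+// auto-exposure buffer. The SPD reduction callbacks are defined in
+// ffx_fsr2_compute_luminance_pyramid.h.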
+ + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_INPUT_COLOR 0 +#define FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC 1 +#define FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE 2 +#define FSR2_BIND_UAV_EXPOSURE_MIP_5 3 +#define FSR2_BIND_UAV_AUTO_EXPOSURE 4 +#define FSR2_BIND_CB_FSR2 5 +#define FSR2_BIND_CB_SPD 6 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" + +#if defined(FSR2_BIND_CB_SPD) + layout (set = 1, binding = FSR2_BIND_CB_SPD, std140) uniform cbSPD_t + { + uint mips; + uint numWorkGroups; + uvec2 workGroupOffset; + uvec2 renderSize; + } cbSPD; + + uint MipCount() + { + return cbSPD.mips; + } + + uint NumWorkGroups() + { + return cbSPD.numWorkGroups; + } + + uvec2 WorkGroupOffset() + { + return cbSPD.workGroupOffset; + } + + uvec2 SPD_RenderSize() + { + return cbSPD.renderSize; + } +#endif + +vec2 SPD_LoadExposureBuffer() +{ + return imageLoad(rw_auto_exposure, ivec2(0,0)).xy; +} + +void SPD_SetExposureBuffer(vec2 value) +{ + imageStore(rw_auto_exposure, ivec2(0,0), vec4(value, 0.0f, 0.0f)); +} + +vec4 SPD_LoadMipmap5(ivec2 iPxPos) +{ + return vec4(imageLoad(rw_img_mip_5, iPxPos).x, 0.0f, 0.0f, 0.0f); +} + +void SPD_SetMipmap(ivec2 iPxPos, uint slice, float value) +{ + switch (slice) + { + case FFX_FSR2_SHADING_CHANGE_MIP_LEVEL: + imageStore(rw_img_mip_shading_change, iPxPos, vec4(value, 0.0f, 0.0f, 0.0f)); + break; + case 5: + imageStore(rw_img_mip_5, iPxPos, vec4(value, 0.0f, 0.0f, 0.0f)); + break; + default: + + // avoid flattened side effect +#if defined(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE) + imageStore(rw_img_mip_shading_change, iPxPos, vec4(imageLoad(rw_img_mip_shading_change, iPxPos).x, 0.0f, 0.0f, 0.0f)); +#elif defined(FSR2_BIND_UAV_EXPOSURE_MIP_5) + imageStore(rw_img_mip_5, iPxPos, vec4(imageLoad(rw_img_mip_5, iPxPos).x, 0.0f, 0.0f, 0.0f)); +#endif + break; + } +} + +void SPD_IncreaseAtomicCounter(inout uint spdCounter) +{ + spdCounter = imageAtomicAdd(rw_spd_global_atomic, ivec2(0,0), 1); +} + +void SPD_ResetAtomicCounter() +{ + imageStore(rw_spd_global_atomic, ivec2(0,0), uvec4(0)); +} + +#include "ffx_fsr2_compute_luminance_pyramid.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 256 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + ComputeAutoExposure(gl_WorkGroupID.xyz, gl_LocalInvocationIndex); +} \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip.h new file mode 100644 index 00000000000..fa4c975a23f --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip.h @@ -0,0 +1,258 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef FFX_FSR2_DEPTH_CLIP_H +#define FFX_FSR2_DEPTH_CLIP_H + +FFX_STATIC const FfxFloat32 DepthClipBaseScale = 4.0f; + +FfxFloat32 ComputeDepthClip(FfxFloat32x2 fUvSample, FfxFloat32 fCurrentDepthSample) +{ + FfxFloat32 fCurrentDepthViewSpace = GetViewSpaceDepth(fCurrentDepthSample); + BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fUvSample, RenderSize()); + + FfxFloat32 fDilatedSum = 0.0f; + FfxFloat32 fDepth = 0.0f; + FfxFloat32 fWeightSum = 0.0f; + for (FfxInt32 iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++) { + + const FfxInt32x2 iOffset = bilinearInfo.iOffsets[iSampleIndex]; + const FfxInt32x2 iSamplePos = bilinearInfo.iBasePos + iOffset; + + if (IsOnScreen(iSamplePos, RenderSize())) { + const FfxFloat32 fWeight = bilinearInfo.fWeights[iSampleIndex]; + if (fWeight > fReconstructedDepthBilinearWeightThreshold) { + + const FfxFloat32 fPrevDepthSample = LoadReconstructedPrevDepth(iSamplePos); + const FfxFloat32 fPrevNearestDepthViewSpace = GetViewSpaceDepth(fPrevDepthSample); + + const FfxFloat32 fDepthDiff = fCurrentDepthViewSpace - fPrevNearestDepthViewSpace; + + if (fDepthDiff > 0.0f) { + +#if FFX_FSR2_OPTION_INVERTED_DEPTH + const FfxFloat32 fPlaneDepth = ffxMin(fPrevDepthSample, fCurrentDepthSample); +#else + const FfxFloat32 fPlaneDepth = ffxMax(fPrevDepthSample, fCurrentDepthSample); +#endif + + const FfxFloat32x3 fCenter = GetViewSpacePosition(FfxInt32x2(RenderSize() * 0.5f), RenderSize(), fPlaneDepth); + const FfxFloat32x3 fCorner = GetViewSpacePosition(FfxInt32x2(0, 0), RenderSize(), fPlaneDepth); + + const FfxFloat32 fHalfViewportWidth = length(FfxFloat32x2(RenderSize())); + const FfxFloat32 fDepthThreshold = ffxMax(fCurrentDepthViewSpace, fPrevNearestDepthViewSpace); + + const FfxFloat32 Ksep = 1.37e-05f; + const FfxFloat32 Kfov = length(fCorner) / length(fCenter); + const FfxFloat32 fRequiredDepthSeparation = Ksep * Kfov * fHalfViewportWidth * fDepthThreshold; + + const FfxFloat32 fResolutionFactor = ffxSaturate(length(FfxFloat32x2(RenderSize())) / length(FfxFloat32x2(1920.0f, 1080.0f))); + const FfxFloat32 fPower = ffxLerp(1.0f, 3.0f, fResolutionFactor); + fDepth += ffxPow(ffxSaturate(FfxFloat32(fRequiredDepthSeparation / fDepthDiff)), fPower) * fWeight; + fWeightSum += fWeight; + } + } + } + } + + return (fWeightSum > 0) ? 
ffxSaturate(1.0f - fDepth / fWeightSum) : 0.0f; +} + +FfxFloat32 ComputeMotionDivergence(FfxInt32x2 iPxPos, FfxInt32x2 iPxInputMotionVectorSize) +{ + FfxFloat32 minconvergence = 1.0f; + + FfxFloat32x2 fMotionVectorNucleus = LoadInputMotionVector(iPxPos); + FfxFloat32 fNucleusVelocityLr = length(fMotionVectorNucleus * RenderSize()); + FfxFloat32 fMaxVelocityUv = length(fMotionVectorNucleus); + + const FfxFloat32 MotionVectorVelocityEpsilon = 1e-02f; + + if (fNucleusVelocityLr > MotionVectorVelocityEpsilon) { + for (FfxInt32 y = -1; y <= 1; ++y) { + for (FfxInt32 x = -1; x <= 1; ++x) { + + FfxInt32x2 sp = ClampLoad(iPxPos, FfxInt32x2(x, y), iPxInputMotionVectorSize); + + FfxFloat32x2 fMotionVector = LoadInputMotionVector(sp); + FfxFloat32 fVelocityUv = length(fMotionVector); + + fMaxVelocityUv = ffxMax(fVelocityUv, fMaxVelocityUv); + fVelocityUv = ffxMax(fVelocityUv, fMaxVelocityUv); + minconvergence = ffxMin(minconvergence, dot(fMotionVector / fVelocityUv, fMotionVectorNucleus / fVelocityUv)); + } + } + } + + return ffxSaturate(1.0f - minconvergence) * ffxSaturate(fMaxVelocityUv / 0.01f); +} + +FfxFloat32 ComputeDepthDivergence(FfxInt32x2 iPxPos) +{ + const FfxFloat32 fMaxDistInMeters = GetMaxDistanceInMeters(); + FfxFloat32 fDepthMax = 0.0f; + FfxFloat32 fDepthMin = fMaxDistInMeters; + + FfxInt32 iMaxDistFound = 0; + + for (FfxInt32 y = -1; y < 2; y++) { + for (FfxInt32 x = -1; x < 2; x++) { + + const FfxInt32x2 iOffset = FfxInt32x2(x, y); + const FfxInt32x2 iSamplePos = iPxPos + iOffset; + + const FfxFloat32 fOnScreenFactor = IsOnScreen(iSamplePos, RenderSize()) ? 1.0f : 0.0f; + FfxFloat32 fDepth = GetViewSpaceDepthInMeters(LoadDilatedDepth(iSamplePos)) * fOnScreenFactor; + + iMaxDistFound |= FfxInt32(fMaxDistInMeters == fDepth); + + fDepthMin = ffxMin(fDepthMin, fDepth); + fDepthMax = ffxMax(fDepthMax, fDepth); + } + } + + return (1.0f - fDepthMin / fDepthMax) * (FfxBoolean(iMaxDistFound) ? 0.0f : 1.0f); +} + +FfxFloat32 ComputeTemporalMotionDivergence(FfxInt32x2 iPxPos) +{ + const FfxFloat32x2 fUv = FfxFloat32x2(iPxPos + 0.5f) / RenderSize(); + + FfxFloat32x2 fMotionVector = LoadDilatedMotionVector(iPxPos); + FfxFloat32x2 fReprojectedUv = fUv + fMotionVector; + fReprojectedUv = ClampUv(fReprojectedUv, RenderSize(), MaxRenderSize()); + FfxFloat32x2 fPrevMotionVector = SamplePreviousDilatedMotionVector(fReprojectedUv); + + float fPxDistance = length(fMotionVector * DisplaySize()); + return fPxDistance > 1.0f ? 
ffxLerp(0.0f, 1.0f - ffxSaturate(length(fPrevMotionVector) / length(fMotionVector)), ffxSaturate(ffxPow(fPxDistance / 20.0f, 3.0f))) : 0; +} + +void PreProcessReactiveMasks(FfxInt32x2 iPxLrPos, FfxFloat32 fMotionDivergence) +{ + // Compensate for bilinear sampling in accumulation pass + + FfxFloat32x3 fReferenceColor = LoadInputColor(iPxLrPos).xyz; + FfxFloat32x2 fReactiveFactor = FfxFloat32x2(0.0f, fMotionDivergence); + + float fMasksSum = 0.0f; + + FfxFloat32x3 fColorSamples[9]; + FfxFloat32 fReactiveSamples[9]; + FfxFloat32 fTransparencyAndCompositionSamples[9]; + + FFX_UNROLL + for (FfxInt32 y = -1; y < 2; y++) { + FFX_UNROLL + for (FfxInt32 x = -1; x < 2; x++) { + + const FfxInt32x2 sampleCoord = ClampLoad(iPxLrPos, FfxInt32x2(x, y), FfxInt32x2(RenderSize())); + + FfxInt32 sampleIdx = (y + 1) * 3 + x + 1; + + FfxFloat32x3 fColorSample = LoadInputColor(sampleCoord).xyz; + FfxFloat32 fReactiveSample = LoadReactiveMask(sampleCoord); + FfxFloat32 fTransparencyAndCompositionSample = LoadTransparencyAndCompositionMask(sampleCoord); + + fColorSamples[sampleIdx] = fColorSample; + fReactiveSamples[sampleIdx] = fReactiveSample; + fTransparencyAndCompositionSamples[sampleIdx] = fTransparencyAndCompositionSample; + + fMasksSum += (fReactiveSample + fTransparencyAndCompositionSample); + } + } + + if (fMasksSum > 0) + { + for (FfxInt32 sampleIdx = 0; sampleIdx < 9; sampleIdx++) + { + FfxFloat32x3 fColorSample = fColorSamples[sampleIdx]; + FfxFloat32 fReactiveSample = fReactiveSamples[sampleIdx]; + FfxFloat32 fTransparencyAndCompositionSample = fTransparencyAndCompositionSamples[sampleIdx]; + + const FfxFloat32 fMaxLenSq = ffxMax(dot(fReferenceColor, fReferenceColor), dot(fColorSample, fColorSample)); + const FfxFloat32 fSimilarity = dot(fReferenceColor, fColorSample) / fMaxLenSq; + + // Increase power for non-similar samples + const FfxFloat32 fPowerBiasMax = 6.0f; + const FfxFloat32 fSimilarityPower = 1.0f + (fPowerBiasMax - fSimilarity * fPowerBiasMax); + const FfxFloat32 fWeightedReactiveSample = ffxPow(fReactiveSample, fSimilarityPower); + const FfxFloat32 fWeightedTransparencyAndCompositionSample = ffxPow(fTransparencyAndCompositionSample, fSimilarityPower); + + fReactiveFactor = ffxMax(fReactiveFactor, FfxFloat32x2(fWeightedReactiveSample, fWeightedTransparencyAndCompositionSample)); + } + } + + StoreDilatedReactiveMasks(iPxLrPos, fReactiveFactor); +} + +FfxFloat32x3 ComputePreparedInputColor(FfxInt32x2 iPxLrPos) +{ + //We assume linear data. if non-linear input (sRGB, ...), + //then we should convert to linear first and back to sRGB on output. 
+ FfxFloat32x3 fRgb = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iPxLrPos)); + + fRgb = PrepareRgb(fRgb, Exposure(), PreExposure()); + + const FfxFloat32x3 fPreparedYCoCg = RGBToYCoCg(fRgb); + + return fPreparedYCoCg; +} + +FfxFloat32 EvaluateSurface(FfxInt32x2 iPxPos, FfxFloat32x2 fMotionVector) +{ + FfxFloat32 d0 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, -1))); + FfxFloat32 d1 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, 0))); + FfxFloat32 d2 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, 1))); + + return 1.0f - FfxFloat32(((d0 - d1) > (d1 * 0.01f)) && ((d1 - d2) > (d2 * 0.01f))); +} + +void DepthClip(FfxInt32x2 iPxPos) +{ + FfxFloat32x2 fDepthUv = (iPxPos + 0.5f) / RenderSize(); + FfxFloat32x2 fMotionVector = LoadDilatedMotionVector(iPxPos); + + // Discard tiny mvs + fMotionVector *= FfxFloat32(length(fMotionVector * DisplaySize()) > 0.01f); + + const FfxFloat32x2 fDilatedUv = fDepthUv + fMotionVector; + const FfxFloat32 fDilatedDepth = LoadDilatedDepth(iPxPos); + const FfxFloat32 fCurrentDepthViewSpace = GetViewSpaceDepth(LoadInputDepth(iPxPos)); + + // Compute prepared input color and depth clip + FfxFloat32 fDepthClip = ComputeDepthClip(fDilatedUv, fDilatedDepth) * EvaluateSurface(iPxPos, fMotionVector); + FfxFloat32x3 fPreparedYCoCg = ComputePreparedInputColor(iPxPos); + StorePreparedInputColor(iPxPos, FfxFloat32x4(fPreparedYCoCg, fDepthClip)); + + // Compute dilated reactive mask +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS + FfxInt32x2 iSamplePos = iPxPos; +#else + FfxInt32x2 iSamplePos = ComputeHrPosFromLrPos(iPxPos); +#endif + + FfxFloat32 fMotionDivergence = ComputeMotionDivergence(iSamplePos, RenderSize()); + FfxFloat32 fTemporalMotionDifference = ffxSaturate(ComputeTemporalMotionDivergence(iPxPos) - ComputeDepthDivergence(iPxPos)); + + PreProcessReactiveMasks(iPxPos, ffxMax(fTemporalMotionDifference, fMotionDivergence)); +} + +#endif //!defined( FFX_FSR2_DEPTH_CLIPH ) \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl new file mode 100644 index 00000000000..65cc8b67eff --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl @@ -0,0 +1,67 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
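+
+// This pass (see ffx_fsr2_depth_clip.h) reprojects the reconstructed previous depth
+// to estimate a per-pixel disocclusion ("depth clip") factor, stores it alongside the
+// exposure-scaled YCoCg prepared input color, and dilates the reactive and
+// transparency/composition masks using motion and depth divergence estimates.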
+ + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH 0 +#define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 1 +#define FSR2_BIND_SRV_DILATED_DEPTH 2 +#define FSR2_BIND_SRV_REACTIVE_MASK 3 +#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 4 +#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 5 +#define FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS 6 +#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 7 +#define FSR2_BIND_SRV_INPUT_COLOR 8 +#define FSR2_BIND_SRV_INPUT_DEPTH 9 +#define FSR2_BIND_SRV_INPUT_EXPOSURE 10 + +#define FSR2_BIND_UAV_DEPTH_CLIP 11 +#define FSR2_BIND_UAV_DILATED_REACTIVE_MASKS 12 +#define FSR2_BIND_UAV_PREPARED_INPUT_COLOR 13 + +#define FSR2_BIND_CB_FSR2 14 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" +#include "ffx_fsr2_sample.h" +#include "ffx_fsr2_depth_clip.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + DepthClip(ivec2(gl_GlobalInvocationID.xy)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_begin.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_begin.h new file mode 100644 index 00000000000..3bd4d5d9129 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_begin.h @@ -0,0 +1 @@ +// This file doesn't exist in this version of FSR. \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_end.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_end.h new file mode 100644 index 00000000000..3bd4d5d9129 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_force16_end.h @@ -0,0 +1 @@ +// This file doesn't exist in this version of FSR. \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock.h new file mode 100644 index 00000000000..8347fa86bcd --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock.h @@ -0,0 +1,115 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef FFX_FSR2_LOCK_H +#define FFX_FSR2_LOCK_H + +void ClearResourcesForNextFrame(in FfxInt32x2 iPxHrPos) +{ + if (all(FFX_LESS_THAN(iPxHrPos, FfxInt32x2(RenderSize())))) + { +#if FFX_FSR2_OPTION_INVERTED_DEPTH + const FfxUInt32 farZ = 0x0; +#else + const FfxUInt32 farZ = 0x3f800000; +#endif + SetReconstructedDepth(iPxHrPos, farZ); + } +} + +FfxBoolean ComputeThinFeatureConfidence(FfxInt32x2 pos) +{ + const FfxInt32 RADIUS = 1; + + FfxFloat32 fNucleus = LoadLockInputLuma(pos); + + FfxFloat32 similar_threshold = 1.05f; + FfxFloat32 dissimilarLumaMin = FSR2_FLT_MAX; + FfxFloat32 dissimilarLumaMax = 0; + + /* + 0 1 2 + 3 4 5 + 6 7 8 + */ + + #define SETBIT(x) (1U << x) + + FfxUInt32 mask = SETBIT(4); //flag fNucleus as similar + + const FfxUInt32 uNumRejectionMasks = 4; + const FfxUInt32 uRejectionMasks[uNumRejectionMasks] = { + SETBIT(0) | SETBIT(1) | SETBIT(3) | SETBIT(4), //Upper left + SETBIT(1) | SETBIT(2) | SETBIT(4) | SETBIT(5), //Upper right + SETBIT(3) | SETBIT(4) | SETBIT(6) | SETBIT(7), //Lower left + SETBIT(4) | SETBIT(5) | SETBIT(7) | SETBIT(8), //Lower right + }; + + FfxInt32 idx = 0; + FFX_UNROLL + for (FfxInt32 y = -RADIUS; y <= RADIUS; y++) { + FFX_UNROLL + for (FfxInt32 x = -RADIUS; x <= RADIUS; x++, idx++) { + if (x == 0 && y == 0) continue; + + FfxInt32x2 samplePos = ClampLoad(pos, FfxInt32x2(x, y), FfxInt32x2(RenderSize())); + + FfxFloat32 sampleLuma = LoadLockInputLuma(samplePos); + FfxFloat32 difference = ffxMax(sampleLuma, fNucleus) / ffxMin(sampleLuma, fNucleus); + + if (difference > 0 && (difference < similar_threshold)) { + mask |= SETBIT(idx); + } else { + dissimilarLumaMin = ffxMin(dissimilarLumaMin, sampleLuma); + dissimilarLumaMax = ffxMax(dissimilarLumaMax, sampleLuma); + } + } + } + + FfxBoolean isRidge = fNucleus > dissimilarLumaMax || fNucleus < dissimilarLumaMin; + + if (FFX_FALSE == isRidge) { + + return false; + } + + FFX_UNROLL + for (FfxInt32 i = 0; i < 4; i++) { + + if ((mask & uRejectionMasks[i]) == uRejectionMasks[i]) { + return false; + } + } + + return true; +} + +void ComputeLock(FfxInt32x2 iPxLrPos) +{ + if (ComputeThinFeatureConfidence(iPxLrPos)) + { + StoreNewLocks(ComputeHrPosFromLrPos(iPxLrPos), 1.f); + } + + ClearResourcesForNextFrame(iPxLrPos); +} + +#endif // FFX_FSR2_LOCK_H diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl new file mode 100644 index 00000000000..0adce1bb112 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl @@ -0,0 +1,56 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_LOCK_INPUT_LUMA 0 +#define FSR2_BIND_UAV_NEW_LOCKS 1 +#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 2 +#define FSR2_BIND_CB_FSR2 3 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" +#include "ffx_fsr2_sample.h" +#include "ffx_fsr2_lock.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + uvec2 uDispatchThreadId = gl_WorkGroupID.xy * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy; + + ComputeLock(ivec2(uDispatchThreadId)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_postprocess_lock_status.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_postprocess_lock_status.h new file mode 100644 index 00000000000..cee9e148ba1 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_postprocess_lock_status.h @@ -0,0 +1,106 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
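+
+// Helpers used by the accumulate pass to maintain per-pixel locks: the current
+// shading-change luma is sampled from the exposure mip pyramid and compared against
+// the luma stored in the lock status, so locks decay, relock, or are killed when the
+// underlying shading changes or the pixel becomes reactive or disoccluded.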
+ +#ifndef FFX_FSR2_POSTPROCESS_LOCK_STATUS_H +#define FFX_FSR2_POSTPROCESS_LOCK_STATUS_H + +FfxFloat32x4 WrapShadingChangeLuma(FfxInt32x2 iPxSample) +{ + return FfxFloat32x4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0); +} + +#if FFX_HALF +FFX_MIN16_F4 WrapShadingChangeLuma(FFX_MIN16_I2 iPxSample) +{ + return FFX_MIN16_F4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0); +} +#endif + +#if FFX_FSR2_OPTION_POSTPROCESSLOCKSTATUS_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBilinearSamplesMin16(FetchShadingChangeLumaSamples, WrapShadingChangeLuma) +#else +DeclareCustomFetchBicubicSamples(FetchShadingChangeLumaSamples, WrapShadingChangeLuma) +#endif +DeclareCustomTextureSample(ShadingChangeLumaSample, Lanczos2, FetchShadingChangeLumaSamples) + +FfxFloat32 GetShadingChangeLuma(FfxInt32x2 iPxHrPos, FfxFloat32x2 fUvCoord) +{ + FfxFloat32 fShadingChangeLuma = 0; + +#if 0 + fShadingChangeLuma = Exposure() * exp(ShadingChangeLumaSample(fUvCoord, LumaMipDimensions()).x); +#else + + const FfxFloat32 fDiv = FfxFloat32(2 << LumaMipLevelToUse()); + FfxInt32x2 iMipRenderSize = FfxInt32x2(RenderSize() / fDiv); + + fUvCoord = ClampUv(fUvCoord, iMipRenderSize, LumaMipDimensions()); + fShadingChangeLuma = Exposure() * exp(FfxFloat32(SampleMipLuma(fUvCoord, LumaMipLevelToUse()))); +#endif + + fShadingChangeLuma = ffxPow(fShadingChangeLuma, 1.0f / 6.0f); + + return fShadingChangeLuma; +} + +void UpdateLockStatus(AccumulationPassCommonParams params, + FFX_PARAMETER_INOUT FfxFloat32 fReactiveFactor, LockState state, + FFX_PARAMETER_INOUT FfxFloat32x2 fLockStatus, + FFX_PARAMETER_OUT FfxFloat32 fLockContributionThisFrame, + FFX_PARAMETER_OUT FfxFloat32 fLuminanceDiff) { + + const FfxFloat32 fShadingChangeLuma = GetShadingChangeLuma(params.iPxHrPos, params.fHrUv); + + //init temporal shading change factor, init to -1 or so in reproject to know if "true new"? + fLockStatus[LOCK_TEMPORAL_LUMA] = (fLockStatus[LOCK_TEMPORAL_LUMA] == FfxFloat32(0.0f)) ? fShadingChangeLuma : fLockStatus[LOCK_TEMPORAL_LUMA]; + + FfxFloat32 fPreviousShadingChangeLuma = fLockStatus[LOCK_TEMPORAL_LUMA]; + + fLuminanceDiff = 1.0f - MinDividedByMax(fPreviousShadingChangeLuma, fShadingChangeLuma); + + if (state.NewLock) { + fLockStatus[LOCK_TEMPORAL_LUMA] = fShadingChangeLuma; + + fLockStatus[LOCK_LIFETIME_REMAINING] = (fLockStatus[LOCK_LIFETIME_REMAINING] != 0.0f) ? 
2.0f : 1.0f; + } + else if(fLockStatus[LOCK_LIFETIME_REMAINING] <= 1.0f) { + fLockStatus[LOCK_TEMPORAL_LUMA] = ffxLerp(fLockStatus[LOCK_TEMPORAL_LUMA], FfxFloat32(fShadingChangeLuma), 0.5f); + } + else { + if (fLuminanceDiff > 0.1f) { + KillLock(fLockStatus); + } + } + + fReactiveFactor = ffxMax(fReactiveFactor, ffxSaturate((fLuminanceDiff - 0.1f) * 10.0f)); + fLockStatus[LOCK_LIFETIME_REMAINING] *= (1.0f - fReactiveFactor); + + fLockStatus[LOCK_LIFETIME_REMAINING] *= ffxSaturate(1.0f - params.fAccumulationMask); + fLockStatus[LOCK_LIFETIME_REMAINING] *= FfxFloat32(params.fDepthClipFactor < 0.1f); + + // Compute this frame lock contribution + const FfxFloat32 fLifetimeContribution = ffxSaturate(fLockStatus[LOCK_LIFETIME_REMAINING] - 1.0f); + const FfxFloat32 fShadingChangeContribution = ffxSaturate(MinDividedByMax(fLockStatus[LOCK_TEMPORAL_LUMA], fShadingChangeLuma)); + + fLockContributionThisFrame = ffxSaturate(ffxSaturate(fLifetimeContribution * 4.0f) * fShadingChangeContribution); +} + +#endif //!defined( FFX_FSR2_POSTPROCESS_LOCK_STATUS_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas.h new file mode 100644 index 00000000000..d9006cd8ee0 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas.h @@ -0,0 +1,67 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#define GROUP_SIZE 8 + +#define FSR_RCAS_DENOISE 1 + +void WriteUpscaledOutput(FFX_MIN16_U2 iPxHrPos, FfxFloat32x3 fUpscaledColor) +{ + StoreUpscaledOutput(FFX_MIN16_I2(iPxHrPos), fUpscaledColor); +} + +#define FSR_RCAS_F +FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p) +{ + FfxFloat32x4 fColor = LoadRCAS_Input(p); + + fColor.rgb = PrepareRgb(fColor.rgb, Exposure(), PreExposure()); + + return fColor; +} + +void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {} + +#include "ffx_fsr1.h" + + +void CurrFilter(FFX_MIN16_U2 pos) +{ + FfxFloat32x3 c; + FsrRcasF(c.r, c.g, c.b, pos, RCASConfig()); + + c = UnprepareRgb(c, Exposure()); + + WriteUpscaledOutput(pos, c); +} + +void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid) +{ + // Do remapping of local xy in workgroup for a more PS-like swizzle pattern. 
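+    // A 64x1 thread group covers a 16x16 pixel tile: ffxRemapForQuad swizzles the linear thread index
+    // into an 8x8 layout, and each thread then filters four pixels at offsets (0,0), (+8,0), (+8,+8), (0,+8).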
+ FfxUInt32x2 gxy = ffxRemapForQuad(LocalThreadId.x) + FfxUInt32x2(WorkGroupId.x << 4u, WorkGroupId.y << 4u); + CurrFilter(FFX_MIN16_U2(gxy)); + gxy.x += 8u; + CurrFilter(FFX_MIN16_U2(gxy)); + gxy.y += 8u; + CurrFilter(FFX_MIN16_U2(gxy)); + gxy.x -= 8u; + CurrFilter(FFX_MIN16_U2(gxy)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl new file mode 100644 index 00000000000..f78fa53e6eb --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl @@ -0,0 +1,80 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require +// Needed for rw_upscaled_output declaration +#extension GL_EXT_shader_image_load_formatted : require + +#define FSR2_BIND_SRV_INPUT_EXPOSURE 0 +#define FSR2_BIND_SRV_RCAS_INPUT 1 +#define FSR2_BIND_UAV_UPSCALED_OUTPUT 2 +#define FSR2_BIND_CB_FSR2 3 +#define FSR2_BIND_CB_RCAS 4 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" + +//Move to prototype shader! 
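+// rcasConfig carries the packed RCAS sharpness constants filled on the CPU side (see ffx_fsr2.cpp);
+// CurrFilter reads them back through RCASConfig() and passes them to FsrRcasF.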
+#if defined(FSR2_BIND_CB_RCAS) + layout (set = 1, binding = FSR2_BIND_CB_RCAS, std140) uniform cbRCAS_t + { + uvec4 rcasConfig; + } cbRCAS; + + uvec4 RCASConfig() + { + return cbRCAS.rcasConfig; + } +#else + uvec4 RCASConfig() + { + return uvec4(0); + } +#endif + +vec4 LoadRCAS_Input(FfxInt32x2 iPxPos) +{ + return texelFetch(r_rcas_input, iPxPos, 0); +} + +#include "ffx_fsr2_rcas.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 64 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + RCAS(gl_LocalInvocationID.xyz, gl_WorkGroupID.xyz, gl_GlobalInvocationID.xyz); +} \ No newline at end of file diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h new file mode 100644 index 00000000000..e9ccc4bc8c2 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h @@ -0,0 +1,145 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H +#define FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H + +void ReconstructPrevDepth(FfxInt32x2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 fMotionVector, FfxInt32x2 iPxDepthSize) +{ + fMotionVector *= FfxFloat32(length(fMotionVector * DisplaySize()) > 0.1f); + + FfxFloat32x2 fUv = (iPxPos + FfxFloat32(0.5)) / iPxDepthSize; + FfxFloat32x2 fReprojectedUv = fUv + fMotionVector; + + BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fReprojectedUv, RenderSize()); + + // Project current depth into previous frame locations. + // Push to all pixels having some contribution if reprojection is using bilinear logic. 
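+    // StoreReconstructedDepth is expected to resolve overlapping scatters atomically (max with inverted
+    // depth, min otherwise) so the nearest surface wins; see ffx_fsr2_callbacks_glsl.h.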
+ for (FfxInt32 iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++) { + + const FfxInt32x2 iOffset = bilinearInfo.iOffsets[iSampleIndex]; + FfxFloat32 fWeight = bilinearInfo.fWeights[iSampleIndex]; + + if (fWeight > fReconstructedDepthBilinearWeightThreshold) { + + FfxInt32x2 iStorePos = bilinearInfo.iBasePos + iOffset; + if (IsOnScreen(iStorePos, iPxDepthSize)) { + StoreReconstructedDepth(iStorePos, fDepth); + } + } + } +} + +void FindNearestDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxInt32x2 iPxSize, FFX_PARAMETER_OUT FfxFloat32 fNearestDepth, FFX_PARAMETER_OUT FfxInt32x2 fNearestDepthCoord) +{ + const FfxInt32 iSampleCount = 9; + const FfxInt32x2 iSampleOffsets[iSampleCount] = { + FfxInt32x2(+0, +0), + FfxInt32x2(+1, +0), + FfxInt32x2(+0, +1), + FfxInt32x2(+0, -1), + FfxInt32x2(-1, +0), + FfxInt32x2(-1, +1), + FfxInt32x2(+1, +1), + FfxInt32x2(-1, -1), + FfxInt32x2(+1, -1), + }; + + // pull out the depth loads to allow SC to batch them + FfxFloat32 depth[9]; + FfxInt32 iSampleIndex = 0; + FFX_UNROLL + for (iSampleIndex = 0; iSampleIndex < iSampleCount; ++iSampleIndex) { + + FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; + depth[iSampleIndex] = LoadInputDepth(iPos); + } + + // find closest depth + fNearestDepthCoord = iPxPos; + fNearestDepth = depth[0]; + FFX_UNROLL + for (iSampleIndex = 1; iSampleIndex < iSampleCount; ++iSampleIndex) { + + FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; + if (IsOnScreen(iPos, iPxSize)) { + + FfxFloat32 fNdDepth = depth[iSampleIndex]; +#if FFX_FSR2_OPTION_INVERTED_DEPTH + if (fNdDepth > fNearestDepth) { +#else + if (fNdDepth < fNearestDepth) { +#endif + fNearestDepthCoord = iPos; + fNearestDepth = fNdDepth; + } + } + } +} + +FfxFloat32 ComputeLockInputLuma(FfxInt32x2 iPxLrPos) +{ + //We assume linear data. if non-linear input (sRGB, ...), + //then we should convert to linear first and back to sRGB on output. + FfxFloat32x3 fRgb = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iPxLrPos)); + + // Use internal auto exposure for locking logic + fRgb /= PreExposure(); + fRgb *= Exposure(); + +#if FFX_FSR2_OPTION_HDR_COLOR_INPUT + fRgb = Tonemap(fRgb); +#endif + + //compute luma used to lock pixels, if used elsewhere the ffxPow must be moved! 
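+    // The same 1/6 exponent is applied to the shading-change luma in ffx_fsr2_postprocess_lock_status.h,
+    // keeping both luma signals in the same perceptual range.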
+ const FfxFloat32 fLockInputLuma = ffxPow(RGBToPerceivedLuma(fRgb), FfxFloat32(1.0 / 6.0)); + + return fLockInputLuma; +} + +void ReconstructAndDilate(FfxInt32x2 iPxLrPos) +{ + FfxFloat32 fDilatedDepth; + FfxInt32x2 iNearestDepthCoord; + + FindNearestDepth(iPxLrPos, RenderSize(), fDilatedDepth, iNearestDepthCoord); + +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS + FfxInt32x2 iSamplePos = iPxLrPos; + FfxInt32x2 iMotionVectorPos = iNearestDepthCoord; +#else + FfxInt32x2 iSamplePos = ComputeHrPosFromLrPos(iPxLrPos); + FfxInt32x2 iMotionVectorPos = ComputeHrPosFromLrPos(iNearestDepthCoord); +#endif + + FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iMotionVectorPos); + + StoreDilatedDepth(iPxLrPos, fDilatedDepth); + StoreDilatedMotionVector(iPxLrPos, fDilatedMotionVector); + + ReconstructPrevDepth(iPxLrPos, fDilatedDepth, fDilatedMotionVector, RenderSize()); + + FfxFloat32 fLockInputLuma = ComputeLockInputLuma(iPxLrPos); + StoreLockInputLuma(iPxLrPos, fLockInputLuma); +} + + +#endif //!defined( FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl new file mode 100644 index 00000000000..25c18c0622d --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl @@ -0,0 +1,65 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
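+// Reconstruct-previous-depth pass: one thread per render-resolution pixel. It dilates depth and motion
+// vectors from the nearest depth in a 3x3 neighborhood, scatters depth into the previous-frame
+// reconstruction target, and writes the lock input luma (see ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h).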
+ + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 0 +#define FSR2_BIND_SRV_INPUT_DEPTH 1 +#define FSR2_BIND_SRV_INPUT_COLOR 2 +#define FSR2_BIND_SRV_INPUT_EXPOSURE 3 +#define FSR2_BIND_SRV_LUMA_HISTORY 4 + +#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 5 +#define FSR2_BIND_UAV_DILATED_MOTION_VECTORS 6 +#define FSR2_BIND_UAV_DILATED_DEPTH 7 +#define FSR2_BIND_UAV_PREPARED_INPUT_COLOR 8 +#define FSR2_BIND_UAV_LUMA_HISTORY 9 +#define FSR2_BIND_UAV_LUMA_INSTABILITY 10 +#define FSR2_BIND_UAV_LOCK_INPUT_LUMA 11 + +#define FSR2_BIND_CB_FSR2 12 + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" +#include "ffx_fsr2_sample.h" +#include "ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + ReconstructAndDilate(FFX_MIN16_I2(gl_GlobalInvocationID.xy)); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_reproject.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reproject.h new file mode 100644 index 00000000000..f7f396129e1 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reproject.h @@ -0,0 +1,136 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef FFX_FSR2_REPROJECT_H +#define FFX_FSR2_REPROJECT_H + +#ifndef FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE +#define FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE 0 // Reference +#endif + +FfxFloat32x4 WrapHistory(FfxInt32x2 iPxSample) +{ + return LoadHistory(iPxSample); +} + +#if FFX_HALF +FFX_MIN16_F4 WrapHistory(FFX_MIN16_I2 iPxSample) +{ + return FFX_MIN16_F4(LoadHistory(iPxSample)); +} +#endif + + +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBicubicSamplesMin16(FetchHistorySamples, WrapHistory) +DeclareCustomTextureSampleMin16(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples) +#else +DeclareCustomFetchBicubicSamples(FetchHistorySamples, WrapHistory) +DeclareCustomTextureSample(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples) +#endif + +FfxFloat32x4 WrapLockStatus(FfxInt32x2 iPxSample) +{ + FfxFloat32x4 fSample = FfxFloat32x4(LoadLockStatus(iPxSample), 0.0f, 0.0f); + return fSample; +} + +#if FFX_HALF +FFX_MIN16_F4 WrapLockStatus(FFX_MIN16_I2 iPxSample) +{ + FFX_MIN16_F4 fSample = FFX_MIN16_F4(LoadLockStatus(iPxSample), 0.0, 0.0); + + return fSample; +} +#endif + +#if 1 +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBilinearSamplesMin16(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSampleMin16(LockStatusSample, Bilinear, FetchLockStatusSamples) +#else +DeclareCustomFetchBilinearSamples(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSample(LockStatusSample, Bilinear, FetchLockStatusSamples) +#endif +#else +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBicubicSamplesMin16(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSampleMin16(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples) +#else +DeclareCustomFetchBicubicSamples(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSample(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples) +#endif +#endif + +FfxFloat32x2 GetMotionVector(FfxInt32x2 iPxHrPos, FfxFloat32x2 fHrUv) +{ +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS + FfxFloat32x2 fDilatedMotionVector = LoadDilatedMotionVector(FFX_MIN16_I2(fHrUv * RenderSize())); +#else + FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iPxHrPos); +#endif + + return fDilatedMotionVector; +} + +FfxBoolean IsUvInside(FfxFloat32x2 fUv) +{ + return (fUv.x >= 0.0f && fUv.x <= 1.0f) && (fUv.y >= 0.0f && fUv.y <= 1.0f); +} + +void ComputeReprojectedUVs(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x2 fReprojectedHrUv, FFX_PARAMETER_OUT FfxBoolean bIsExistingSample) +{ + fReprojectedHrUv = params.fHrUv + params.fMotionVector; + + bIsExistingSample = IsUvInside(fReprojectedHrUv); +} + +void ReprojectHistoryColor(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x3 fHistoryColor, FFX_PARAMETER_OUT FfxFloat32 fTemporalReactiveFactor, FFX_PARAMETER_OUT FfxBoolean bInMotionLastFrame) +{ + FfxFloat32x4 fHistory = HistorySample(params.fReprojectedHrUv, DisplaySize()); + + fHistoryColor = PrepareRgb(fHistory.rgb, Exposure(), PreviousFramePreExposure()); + + fHistoryColor = RGBToYCoCg(fHistoryColor); + + //Compute temporal reactivity info + fTemporalReactiveFactor = ffxSaturate(abs(fHistory.w)); + bInMotionLastFrame = (fHistory.w < 0.0f); +} 
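+// History alpha packs the temporal reactive factor in its magnitude and flags in-motion pixels with a
+// negative sign (see ReprojectHistoryColor above); a new-lock intensity above 127/255 marks a fresh lock.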
+ +LockState ReprojectHistoryLockStatus(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x2 fReprojectedLockStatus) +{ + LockState state = { FFX_FALSE, FFX_FALSE }; + const FfxFloat32 fNewLockIntensity = LoadRwNewLocks(params.iPxHrPos); + state.NewLock = fNewLockIntensity > (127.0f / 255.0f); + + FfxFloat32 fInPlaceLockLifetime = state.NewLock ? fNewLockIntensity : 0; + + fReprojectedLockStatus = SampleLockStatus(params.fReprojectedHrUv); + + if (fReprojectedLockStatus[LOCK_LIFETIME_REMAINING] != FfxFloat32(0.0f)) { + state.WasLockedPrevFrame = true; + } + + return state; +} + +#endif //!defined( FFX_FSR2_REPROJECT_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_resources.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_resources.h new file mode 100644 index 00000000000..535dbc383c7 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_resources.h @@ -0,0 +1,105 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef FFX_FSR2_RESOURCES_H +#define FFX_FSR2_RESOURCES_H + +#if defined(FFX_CPU) || defined(FFX_GPU) +#define FFX_FSR2_RESOURCE_IDENTIFIER_NULL 0 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY 1 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR 2 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS 3 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH 4 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE 5 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK 6 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK 7 +#define FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH 8 +#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS 9 +#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH 10 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR 11 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS 12 +#define FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS 13 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR 14 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY 15 +#define FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT 16 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT 17 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT 18 +#define FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT 19 +#define FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT 20 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1 21 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2 22 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_1 23 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_2 24 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY 25 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_TRANSPARENCY_AND_COMPOSITION 26 +#define FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT 27 +#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS 28 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE 29 // same as FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0 29 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_1 30 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_2 31 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_3 32 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_4 33 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5 34 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_6 35 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_7 36 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_8 37 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_9 38 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_10 39 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_11 40 +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_12 41 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_EXPOSURE 42 +#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE 43 +#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE 44 +#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION 45 + +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR 46 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR 47 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_1 48 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_1 49 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_2 50 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_2 51 +#define FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS 52 +#define 
FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_1 53 +#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_2 54 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_1 55 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_2 56 +#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA 57 + +// Shading change detection mip level setting, value must be in the range [FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0, FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_12] +#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_4 +#define FFX_FSR2_SHADING_CHANGE_MIP_LEVEL (FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE - FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE) + +#define FFX_FSR2_RESOURCE_IDENTIFIER_COUNT 58 + +#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2 0 +#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD 1 +#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS 2 +#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_GENREACTIVE 3 + +#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP 1 +#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP 2 +#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD 4 +#define FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX 8 + +#endif // #if defined(FFX_CPU) || defined(FFX_GPU) + +#endif //!defined( FFX_FSR2_RESOURCES_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_sample.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_sample.h new file mode 100644 index 00000000000..f94f40aa793 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_sample.h @@ -0,0 +1,605 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
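+// Sampling helpers: bilinear and Lanczos-2 bicubic filters (exact, LUT-based and polynomial-approximation
+// variants) with a 2x2 deringing clamp, plus the DeclareCustomFetch*/DeclareCustomTextureSample macros
+// that stamp out the typed fetch and filter entry points used by ffx_fsr2_reproject.h and
+// ffx_fsr2_postprocess_lock_status.h.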
+ +#ifndef FFX_FSR2_SAMPLE_H +#define FFX_FSR2_SAMPLE_H + +// suppress warnings +#ifdef FFX_HLSL +#pragma warning(disable: 4008) // potentially divide by zero +#endif //FFX_HLSL + +struct FetchedBilinearSamples { + + FfxFloat32x4 fColor00; + FfxFloat32x4 fColor10; + + FfxFloat32x4 fColor01; + FfxFloat32x4 fColor11; +}; + +struct FetchedBicubicSamples { + + FfxFloat32x4 fColor00; + FfxFloat32x4 fColor10; + FfxFloat32x4 fColor20; + FfxFloat32x4 fColor30; + + FfxFloat32x4 fColor01; + FfxFloat32x4 fColor11; + FfxFloat32x4 fColor21; + FfxFloat32x4 fColor31; + + FfxFloat32x4 fColor02; + FfxFloat32x4 fColor12; + FfxFloat32x4 fColor22; + FfxFloat32x4 fColor32; + + FfxFloat32x4 fColor03; + FfxFloat32x4 fColor13; + FfxFloat32x4 fColor23; + FfxFloat32x4 fColor33; +}; + +#if FFX_HALF +struct FetchedBilinearSamplesMin16 { + + FFX_MIN16_F4 fColor00; + FFX_MIN16_F4 fColor10; + + FFX_MIN16_F4 fColor01; + FFX_MIN16_F4 fColor11; +}; + +struct FetchedBicubicSamplesMin16 { + + FFX_MIN16_F4 fColor00; + FFX_MIN16_F4 fColor10; + FFX_MIN16_F4 fColor20; + FFX_MIN16_F4 fColor30; + + FFX_MIN16_F4 fColor01; + FFX_MIN16_F4 fColor11; + FFX_MIN16_F4 fColor21; + FFX_MIN16_F4 fColor31; + + FFX_MIN16_F4 fColor02; + FFX_MIN16_F4 fColor12; + FFX_MIN16_F4 fColor22; + FFX_MIN16_F4 fColor32; + + FFX_MIN16_F4 fColor03; + FFX_MIN16_F4 fColor13; + FFX_MIN16_F4 fColor23; + FFX_MIN16_F4 fColor33; +}; +#else //FFX_HALF +#define FetchedBicubicSamplesMin16 FetchedBicubicSamples +#define FetchedBilinearSamplesMin16 FetchedBilinearSamples +#endif //FFX_HALF + +FfxFloat32x4 Linear(FfxFloat32x4 A, FfxFloat32x4 B, FfxFloat32 t) +{ + return A + (B - A) * t; +} + +FfxFloat32x4 Bilinear(FetchedBilinearSamples BilinearSamples, FfxFloat32x2 fPxFrac) +{ + FfxFloat32x4 fColorX0 = Linear(BilinearSamples.fColor00, BilinearSamples.fColor10, fPxFrac.x); + FfxFloat32x4 fColorX1 = Linear(BilinearSamples.fColor01, BilinearSamples.fColor11, fPxFrac.x); + FfxFloat32x4 fColorXY = Linear(fColorX0, fColorX1, fPxFrac.y); + return fColorXY; +} + +#if FFX_HALF +FFX_MIN16_F4 Linear(FFX_MIN16_F4 A, FFX_MIN16_F4 B, FFX_MIN16_F t) +{ + return A + (B - A) * t; +} + +FFX_MIN16_F4 Bilinear(FetchedBilinearSamplesMin16 BilinearSamples, FFX_MIN16_F2 fPxFrac) +{ + FFX_MIN16_F4 fColorX0 = Linear(BilinearSamples.fColor00, BilinearSamples.fColor10, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Linear(BilinearSamples.fColor01, BilinearSamples.fColor11, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Linear(fColorX0, fColorX1, fPxFrac.y); + return fColorXY; +} +#endif + +FfxFloat32 Lanczos2NoClamp(FfxFloat32 x) +{ + const FfxFloat32 PI = 3.141592653589793f; // TODO: share SDK constants + return abs(x) < FSR2_EPSILON ? 1.f : (sin(PI * x) / (PI * x)) * (sin(0.5f * PI * x) / (0.5f * PI * x)); +} + +FfxFloat32 Lanczos2(FfxFloat32 x) +{ + x = ffxMin(abs(x), 2.0f); + return Lanczos2NoClamp(x); +} + +#if FFX_HALF + +#if 0 +FFX_MIN16_F Lanczos2NoClamp(FFX_MIN16_F x) +{ + const FFX_MIN16_F PI = FFX_MIN16_F(3.141592653589793f); // TODO: share SDK constants + return abs(x) < FFX_MIN16_F(FSR2_EPSILON) ? FFX_MIN16_F(1.f) : (sin(PI * x) / (PI * x)) * (sin(FFX_MIN16_F(0.5f) * PI * x) / (FFX_MIN16_F(0.5f) * PI * x)); +} +#endif + +FFX_MIN16_F Lanczos2(FFX_MIN16_F x) +{ + x = ffxMin(abs(x), FFX_MIN16_F(2.0f)); + return FFX_MIN16_F(Lanczos2NoClamp(x)); +} +#endif //FFX_HALF + +// FSR1 lanczos approximation. Input is x*x and must be <= 4. 
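+// Polynomial fit of the Lanczos-2 kernel above, sin(pi*x)/(pi*x) * sin(pi*x/2)/(pi*x/2), evaluated on
+// x^2 so callers can pass a squared distance without taking a square root.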
+FfxFloat32 Lanczos2ApproxSqNoClamp(FfxFloat32 x2) +{ + FfxFloat32 a = (2.0f / 5.0f) * x2 - 1; + FfxFloat32 b = (1.0f / 4.0f) * x2 - 1; + return ((25.0f / 16.0f) * a * a - (25.0f / 16.0f - 1)) * (b * b); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2ApproxSqNoClamp(FFX_MIN16_F x2) +{ + FFX_MIN16_F a = FFX_MIN16_F(2.0f / 5.0f) * x2 - FFX_MIN16_F(1); + FFX_MIN16_F b = FFX_MIN16_F(1.0f / 4.0f) * x2 - FFX_MIN16_F(1); + return (FFX_MIN16_F(25.0f / 16.0f) * a * a - FFX_MIN16_F(25.0f / 16.0f - 1)) * (b * b); +} +#endif //FFX_HALF + +FfxFloat32 Lanczos2ApproxSq(FfxFloat32 x2) +{ + x2 = ffxMin(x2, 4.0f); + return Lanczos2ApproxSqNoClamp(x2); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2ApproxSq(FFX_MIN16_F x2) +{ + x2 = ffxMin(x2, FFX_MIN16_F(4.0f)); + return Lanczos2ApproxSqNoClamp(x2); +} +#endif //FFX_HALF + +FfxFloat32 Lanczos2ApproxNoClamp(FfxFloat32 x) +{ + return Lanczos2ApproxSqNoClamp(x * x); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2ApproxNoClamp(FFX_MIN16_F x) +{ + return Lanczos2ApproxSqNoClamp(x * x); +} +#endif //FFX_HALF + +FfxFloat32 Lanczos2Approx(FfxFloat32 x) +{ + return Lanczos2ApproxSq(x * x); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2Approx(FFX_MIN16_F x) +{ + return Lanczos2ApproxSq(x * x); +} +#endif //FFX_HALF + +FfxFloat32 Lanczos2_UseLUT(FfxFloat32 x) +{ + return SampleLanczos2Weight(abs(x)); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2_UseLUT(FFX_MIN16_F x) +{ + return FFX_MIN16_F(SampleLanczos2Weight(abs(x))); +} +#endif //FFX_HALF + +FfxFloat32x4 Lanczos2_UseLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) +{ + FfxFloat32 fWeight0 = Lanczos2_UseLUT(-1.f - t); + FfxFloat32 fWeight1 = Lanczos2_UseLUT(-0.f - t); + FfxFloat32 fWeight2 = Lanczos2_UseLUT(+1.f - t); + FfxFloat32 fWeight3 = Lanczos2_UseLUT(+2.f - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} +#if FFX_HALF +FFX_MIN16_F4 Lanczos2_UseLUT(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) +{ + FFX_MIN16_F fWeight0 = Lanczos2_UseLUT(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2_UseLUT(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2_UseLUT(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2_UseLUT(FFX_MIN16_F(+2.f) - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} +#endif + +FfxFloat32x4 Lanczos2(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) +{ + FfxFloat32 fWeight0 = Lanczos2(-1.f - t); + FfxFloat32 fWeight1 = Lanczos2(-0.f - t); + FfxFloat32 fWeight2 = Lanczos2(+1.f - t); + FfxFloat32 fWeight3 = Lanczos2(+2.f - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} + +FfxFloat32x4 Lanczos2(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) +{ + FfxFloat32x4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FfxFloat32x4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FfxFloat32x4 fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FfxFloat32x4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FfxFloat32x4 fColorXY = Lanczos2(fColorX0, 
fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FfxFloat32x4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FfxFloat32x4 fDeringingMin = fDeringingSamples[0]; + FfxFloat32x4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) { + + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} + +#if FFX_HALF +FFX_MIN16_F4 Lanczos2(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) +{ + FFX_MIN16_F fWeight0 = Lanczos2(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2(FFX_MIN16_F(+2.f) - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} + +FFX_MIN16_F4 Lanczos2(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) +{ + FFX_MIN16_F4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FFX_MIN16_F4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) + { + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} +#endif //FFX_HALF + + +FfxFloat32x4 Lanczos2LUT(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) +{ + FfxFloat32x4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FfxFloat32x4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FfxFloat32x4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FfxFloat32x4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FfxFloat32x4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FfxFloat32x4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FfxFloat32x4 
fDeringingMin = fDeringingSamples[0]; + FfxFloat32x4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) { + + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} + +#if FFX_HALF +FFX_MIN16_F4 Lanczos2LUT(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) +{ + FFX_MIN16_F4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FFX_MIN16_F4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) + { + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} +#endif //FFX_HALF + + + +FfxFloat32x4 Lanczos2Approx(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) +{ + FfxFloat32 fWeight0 = Lanczos2ApproxNoClamp(-1.f - t); + FfxFloat32 fWeight1 = Lanczos2ApproxNoClamp(-0.f - t); + FfxFloat32 fWeight2 = Lanczos2ApproxNoClamp(+1.f - t); + FfxFloat32 fWeight3 = Lanczos2ApproxNoClamp(+2.f - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} + +#if FFX_HALF +FFX_MIN16_F4 Lanczos2Approx(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) +{ + FFX_MIN16_F fWeight0 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+2.f) - t); + return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); +} +#endif //FFX_HALF + +FfxFloat32x4 Lanczos2Approx(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) +{ + FfxFloat32x4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FfxFloat32x4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FfxFloat32x4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FfxFloat32x4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FfxFloat32x4 
fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FfxFloat32x4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FfxFloat32x4 fDeringingMin = fDeringingSamples[0]; + FfxFloat32x4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) + { + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} + +#if FFX_HALF +FFX_MIN16_F4 Lanczos2Approx(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) +{ + FFX_MIN16_F4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FFX_MIN16_F4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) + { + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} +#endif + +// Clamp by offset direction. Assuming iPxSample is already in range and iPxOffset is compile time constant. +FfxInt32x2 ClampCoord(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize) +{ + FfxInt32x2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < 0) ? ffxMax(result.x, 0) : result.x; + result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x; + result.y = (iPxOffset.y < 0) ? ffxMax(result.y, 0) : result.y; + result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - 1) : result.y; + return result; +} +#if FFX_HALF +FFX_MIN16_I2 ClampCoord(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize) +{ + FFX_MIN16_I2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < FFX_MIN16_I(0)) ? ffxMax(result.x, FFX_MIN16_I(0)) : result.x; + result.x = (iPxOffset.x > FFX_MIN16_I(0)) ? ffxMin(result.x, iTextureSize.x - FFX_MIN16_I(1)) : result.x; + result.y = (iPxOffset.y < FFX_MIN16_I(0)) ? ffxMax(result.y, FFX_MIN16_I(0)) : result.y; + result.y = (iPxOffset.y > FFX_MIN16_I(0)) ? 
ffxMin(result.y, iTextureSize.y - FFX_MIN16_I(1)) : result.y; + return result; +} +#endif //FFX_HALF + + +#define DeclareCustomFetchBicubicSamplesWithType(SampleType, TextureType, AddrType, Name, LoadTexture) \ + SampleType Name(AddrType iPxSample, AddrType iTextureSize) \ + { \ + SampleType Samples; \ + \ + Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, -1), iTextureSize))); \ + Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, -1), iTextureSize))); \ + Samples.fColor20 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, -1), iTextureSize))); \ + Samples.fColor30 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, -1), iTextureSize))); \ + \ + Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +0), iTextureSize))); \ + Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \ + Samples.fColor21 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \ + Samples.fColor31 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +0), iTextureSize))); \ + \ + Samples.fColor02 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +1), iTextureSize))); \ + Samples.fColor12 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \ + Samples.fColor22 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \ + Samples.fColor32 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +1), iTextureSize))); \ + \ + Samples.fColor03 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +2), iTextureSize))); \ + Samples.fColor13 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +2), iTextureSize))); \ + Samples.fColor23 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +2), iTextureSize))); \ + Samples.fColor33 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +2), iTextureSize))); \ + \ + return Samples; \ + } + +#define DeclareCustomFetchBicubicSamples(Name, LoadTexture) \ + DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture) + +#define DeclareCustomFetchBicubicSamplesMin16(Name, LoadTexture) \ + DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture) + +#define DeclareCustomFetchBilinearSamplesWithType(SampleType, TextureType,AddrType, Name, LoadTexture) \ + SampleType Name(AddrType iPxSample, AddrType iTextureSize) \ + { \ + SampleType Samples; \ + Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \ + Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \ + Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \ + Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \ + return Samples; \ + } + +#define DeclareCustomFetchBilinearSamples(Name, LoadTexture) \ + DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture) + +#define DeclareCustomFetchBilinearSamplesMin16(Name, LoadTexture) \ + DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture) + +// BE CAREFUL: there is some precision issues and (3253, 125) leading to (3252.9989778, 125.001102) +// is common, so iPxSample can "jitter" +#define 
DeclareCustomTextureSample(Name, InterpolateSamples, FetchSamples) \ + FfxFloat32x4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \ + { \ + FfxFloat32x2 fPxSample = (fUvSample * FfxFloat32x2(iTextureSize)) - FfxFloat32x2(0.5f, 0.5f); \ + /* Clamp base coords */ \ + fPxSample.x = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.x), fPxSample.x)); \ + fPxSample.y = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.y), fPxSample.y)); \ + /* */ \ + FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \ + FfxFloat32x2 fPxFrac = ffxFract(fPxSample); \ + FfxFloat32x4 fColorXY = FfxFloat32x4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \ + return fColorXY; \ + } + +#define DeclareCustomTextureSampleMin16(Name, InterpolateSamples, FetchSamples) \ + FFX_MIN16_F4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \ + { \ + FfxFloat32x2 fPxSample = (fUvSample * FfxFloat32x2(iTextureSize)) - FfxFloat32x2(0.5f, 0.5f); \ + /* Clamp base coords */ \ + fPxSample.x = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.x), fPxSample.x)); \ + fPxSample.y = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.y), fPxSample.y)); \ + /* */ \ + FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \ + FFX_MIN16_F2 fPxFrac = FFX_MIN16_F2(ffxFract(fPxSample)); \ + FFX_MIN16_F4 fColorXY = FFX_MIN16_F4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \ + return fColorXY; \ + } + +#define FFX_FSR2_CONCAT_ID(x, y) x ## y +#define FFX_FSR2_CONCAT(x, y) FFX_FSR2_CONCAT_ID(x, y) +#define FFX_FSR2_SAMPLER_1D_0 Lanczos2 +#define FFX_FSR2_SAMPLER_1D_1 Lanczos2LUT +#define FFX_FSR2_SAMPLER_1D_2 Lanczos2Approx + +#define FFX_FSR2_GET_LANCZOS_SAMPLER1D(x) FFX_FSR2_CONCAT(FFX_FSR2_SAMPLER_1D_, x) + +#endif //!defined( FFX_FSR2_SAMPLE_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen.h new file mode 100644 index 00000000000..101b75d25e4 --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen.h @@ -0,0 +1,250 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#define USE_YCOCG 1 + +#define fAutogenEpsilon 0.01f + +// EXPERIMENTAL + +FFX_MIN16_F ComputeAutoTC_01(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx) +{ + FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId); + FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId); + FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx); + FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx); + +#if USE_YCOCG + colorPreAlpha = RGBToYCoCg(colorPreAlpha); + colorPostAlpha = RGBToYCoCg(colorPostAlpha); + colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha); + colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha); +#endif + + FfxFloat32x3 colorDeltaCurr = colorPostAlpha - colorPreAlpha; + FfxFloat32x3 colorDeltaPrev = colorPrevPostAlpha - colorPrevPreAlpha; + bool hasAlpha = any(FFX_GREATER_THAN(abs(colorDeltaCurr), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon))); + bool hadAlpha = any(FFX_GREATER_THAN(abs(colorDeltaPrev), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon))); + + FfxFloat32x3 X = colorPreAlpha; + FfxFloat32x3 Y = colorPostAlpha; + FfxFloat32x3 Z = colorPrevPreAlpha; + FfxFloat32x3 W = colorPrevPostAlpha; + + FFX_MIN16_F retVal = FFX_MIN16_F(ffxSaturate(dot(abs(abs(Y - X) - abs(W - Z)), FfxFloat32x3(1, 1, 1)))); + + // cleanup very small values + retVal = (retVal < getTcThreshold()) ? FFX_MIN16_F(0.0f) : FFX_MIN16_F(1.f); + + return retVal; +} + +// works ok: thin edges +FFX_MIN16_F ComputeAutoTC_02(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx) +{ + FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId); + FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId); + FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx); + FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx); + +#if USE_YCOCG + colorPreAlpha = RGBToYCoCg(colorPreAlpha); + colorPostAlpha = RGBToYCoCg(colorPostAlpha); + colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha); + colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha); +#endif + + FfxFloat32x3 colorDelta = colorPostAlpha - colorPreAlpha; + FfxFloat32x3 colorPrevDelta = colorPrevPostAlpha - colorPrevPreAlpha; + bool hasAlpha = any(FFX_GREATER_THAN(abs(colorDelta), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon))); + bool hadAlpha = any(FFX_GREATER_THAN(abs(colorPrevDelta), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon))); + + FfxFloat32x3 delta = colorPostAlpha - colorPreAlpha; //prev+1*d = post => d = color, alpha = + FfxFloat32x3 deltaPrev = colorPrevPostAlpha - colorPrevPreAlpha; + + FfxFloat32x3 X = colorPrevPreAlpha; + FfxFloat32x3 N = colorPreAlpha - colorPrevPreAlpha; + FfxFloat32x3 YAminusXA = colorPrevPostAlpha - colorPrevPreAlpha; + FfxFloat32x3 NminusNA = colorPostAlpha - colorPrevPostAlpha; + + FfxFloat32x3 A = (hasAlpha || hadAlpha) ? NminusNA / max(FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon), N) : FfxFloat32x3(0, 0, 0); + + FFX_MIN16_F retVal = FFX_MIN16_F( max(max(A.x, A.y), A.z) ); + + // only pixels that have significantly changed in color shuold be considered + retVal = ffxSaturate(retVal * FFX_MIN16_F(length(colorPostAlpha - colorPrevPostAlpha)) ); + + return retVal; +} + +// This function computes the TransparencyAndComposition mask: +// This mask indicates pixels that should discard locks and apply color clamping. +// +// Typically this is the case for translucent pixels (that don't write depth values) or pixels where the correctness of +// the MVs can not be guaranteed (e.g. 
procedural movement or vegetation that does not have MVs to reduce the cost during rasterization)
+// Also, large changes in color due to changed lighting should be marked to remove locks on pixels with "old" lighting.
+//
+// This function takes an opaque-only and a final texture and uses internal copies of those textures from the last frame.
+// The function tries to determine where the color changes between opaque only and final image to determine the pixels that use transparency.
+// Also it uses the previous frames and detects where the use of transparency changed to mark those pixels.
+// Additionally it marks pixels where the color changed significantly in the opaque only image, e.g. due to lighting or texture animation.
+//
+// In the final step it stores the current textures in internal textures for the next frame
+
+FFX_MIN16_F ComputeTransparencyAndComposition(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
+{
+    FFX_MIN16_F retVal = ComputeAutoTC_02(uDispatchThreadId, iPrevIdx);
+
+    // [branch]
+    if (retVal > FFX_MIN16_F(0.01f))
+    {
+        retVal = ComputeAutoTC_01(uDispatchThreadId, iPrevIdx);
+    }
+    return retVal;
+}
+
+float computeSolidEdge(FFX_MIN16_I2 curPos, FFX_MIN16_I2 prevPos)
+{
+    float lum[9];
+    int i = 0;
+    for (int y = -1; y < 2; ++y)
+    {
+        for (int x = -1; x < 2; ++x)
+        {
+            FfxFloat32x3 curCol = LoadOpaqueOnly(curPos + FFX_MIN16_I2(x, y)).rgb;
+            FfxFloat32x3 prevCol = LoadPrevPreAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb;
+            lum[i++] = length(curCol - prevCol);
+        }
+    }
+
+    //float gradX = abs(lum[3] - lum[4]) + abs(lum[5] - lum[4]);
+    //float gradY = abs(lum[1] - lum[4]) + abs(lum[7] - lum[4]);
+
+    //return sqrt(gradX * gradX + gradY * gradY);
+
+    float gradX = abs(lum[3] - lum[4]) * abs(lum[5] - lum[4]);
+    float gradY = abs(lum[1] - lum[4]) * abs(lum[7] - lum[4]);
+
+    return sqrt(sqrt(gradX * gradY));
+}
+
+float computeAlphaEdge(FFX_MIN16_I2 curPos, FFX_MIN16_I2 prevPos)
+{
+    float lum[9];
+    int i = 0;
+    for (int y = -1; y < 2; ++y)
+    {
+        for (int x = -1; x < 2; ++x)
+        {
+            FfxFloat32x3 curCol = abs(LoadInputColor(curPos + FFX_MIN16_I2(x, y)).rgb - LoadOpaqueOnly(curPos + FFX_MIN16_I2(x, y)).rgb);
+            FfxFloat32x3 prevCol = abs(LoadPrevPostAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb - LoadPrevPreAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb);
+            lum[i++] = length(curCol - prevCol);
+        }
+    }
+
+    //float gradX = abs(lum[3] - lum[4]) + abs(lum[5] - lum[4]);
+    //float gradY = abs(lum[1] - lum[4]) + abs(lum[7] - lum[4]);
+
+    //return sqrt(gradX * gradX + gradY * gradY);
+
+    float gradX = abs(lum[3] - lum[4]) * abs(lum[5] - lum[4]);
+    float gradY = abs(lum[1] - lum[4]) * abs(lum[7] - lum[4]);
+
+    return sqrt(sqrt(gradX * gradY));
+}
+
+FFX_MIN16_F ComputeAabbOverlap(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
+{
+    FFX_MIN16_F retVal = FFX_MIN16_F(0.f);
+
+    FfxFloat32x2 fMotionVector = LoadInputMotionVector(uDispatchThreadId);
+    FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId);
+    FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId);
+    FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx);
+    FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx);
+
+#if USE_YCOCG
+    colorPreAlpha = RGBToYCoCg(colorPreAlpha);
+    colorPostAlpha = RGBToYCoCg(colorPostAlpha);
+    colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha);
+    colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha);
+#endif
+    FfxFloat32x3 minPrev = FFX_MIN16_F3(+1000.f, +1000.f, +1000.f);
+    FfxFloat32x3 maxPrev = FFX_MIN16_F3(-1000.f, -1000.f, -1000.f);
+    for (int y = -1; y < 2; ++y)
+    {
+        for (int x = -1; x < 2; ++x)
+        {
+            FfxFloat32x3 W = LoadPrevPostAlpha(iPrevIdx + FFX_MIN16_I2(x, y));
+
+#if USE_YCOCG
+            W = RGBToYCoCg(W);
+#endif
+            minPrev = min(minPrev, W);
+            maxPrev = max(maxPrev, W);
+        }
+    }
+    // instead of computing the overlap: simply count how many samples are outside
+    // set reactive based on that
+    FFX_MIN16_F count = FFX_MIN16_F(0.f);
+    for (int y = -1; y < 2; ++y)
+    {
+        for (int x = -1; x < 2; ++x)
+        {
+            FfxFloat32x3 Y = LoadInputColor(uDispatchThreadId + FFX_MIN16_I2(x, y));
+
+#if USE_YCOCG
+            Y = RGBToYCoCg(Y);
+#endif
+            count += ((Y.x < minPrev.x) || (Y.x > maxPrev.x)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
+            count += ((Y.y < minPrev.y) || (Y.y > maxPrev.y)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
+            count += ((Y.z < minPrev.z) || (Y.z > maxPrev.z)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
+        }
+    }
+    retVal = count / FFX_MIN16_F(27.f);
+
+    return retVal;
+}
+
+
+// This function computes the Reactive mask:
+// We want pixels marked where the alpha portion of the frame changes a lot between neighbours
+// Those pixels are expected to change quickly between frames, too. (e.g. small particles, reflections on curved surfaces...)
+// As a result history would not be trustworthy.
+// On the other hand we don't want pixels marked where pre-alpha has a large difference, since those would profit from accumulation
+// For mirrors we may assume the pre-alpha is pretty uniform color.
+//
+// This works well generally, but also marks edge pixels
+FFX_MIN16_F ComputeReactive(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
+{
+    // we only get here if alpha has a significant contribution and has changed since last frame.
+    FFX_MIN16_F retVal = FFX_MIN16_F(0.f);
+
+    // mark pixels with huge variance in alpha as reactive
+    FFX_MIN16_F alphaEdge = FFX_MIN16_F(computeAlphaEdge(uDispatchThreadId, iPrevIdx));
+    FFX_MIN16_F opaqueEdge = FFX_MIN16_F(computeSolidEdge(uDispatchThreadId, iPrevIdx));
+    retVal = ffxSaturate(alphaEdge - opaqueEdge);
+
+    // the above also marks edge pixels due to jitter, so we need to cancel those out
+
+
+    return retVal;
+}
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
new file mode 100644
index 00000000000..12b4b40e08a
--- /dev/null
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
@@ -0,0 +1,122 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_samplerless_texture_functions : require + +#define FSR2_BIND_SRV_INPUT_OPAQUE_ONLY 0 +#define FSR2_BIND_SRV_INPUT_COLOR 1 +#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 2 +#define FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR 3 +#define FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR 4 +#define FSR2_BIND_SRV_REACTIVE_MASK 5 +#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 6 + +#define FSR2_BIND_UAV_AUTOREACTIVE 7 +#define FSR2_BIND_UAV_AUTOCOMPOSITION 8 +#define FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR 9 +#define FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR 10 + +#define FSR2_BIND_CB_FSR2 11 +#define FSR2_BIND_CB_REACTIVE 12 + +// -- GODOT start -- +#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS +#define FSR2_BIND_SRV_INPUT_DEPTH 13 +#endif +// -- GODOT end -- + +#include "ffx_fsr2_callbacks_glsl.h" +#include "ffx_fsr2_common.h" + +#ifdef FSR2_BIND_CB_REACTIVE +layout (set = 1, binding = FSR2_BIND_CB_REACTIVE, std140) uniform cbGenerateReactive_t +{ + float fTcThreshold; // 0.1 is a good starting value, lower will result in more TC pixels + float fTcScale; + float fReactiveScale; + float fReactiveMax; +} cbGenerateReactive; + +float getTcThreshold() +{ + return cbGenerateReactive.fTcThreshold; +} + +#else + float getTcThreshold() + { + return 0.05f; + } +#endif + +#include "ffx_fsr2_tcr_autogen.h" + +#ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#define FFX_FSR2_THREAD_GROUP_WIDTH 8 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH +#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT +#define FFX_FSR2_THREAD_GROUP_HEIGHT 8 +#endif // FFX_FSR2_THREAD_GROUP_HEIGHT +#ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#define FFX_FSR2_THREAD_GROUP_DEPTH 1 +#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH +#ifndef FFX_FSR2_NUM_THREADS +#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in; +#endif // #ifndef FFX_FSR2_NUM_THREADS + +FFX_FSR2_NUM_THREADS +void main() +{ + FFX_MIN16_I2 uDispatchThreadId = FFX_MIN16_I2(gl_GlobalInvocationID.xy); + + // ToDo: take into account jitter (i.e. add delta of previous jitter and current jitter to previous UV + // fetch pre- and post-alpha color values + FFX_MIN16_F2 fUv = ( FFX_MIN16_F2(uDispatchThreadId) + FFX_MIN16_F2(0.5f, 0.5f) ) / FFX_MIN16_F2( RenderSize() ); + FFX_MIN16_F2 fPrevUV = fUv + FFX_MIN16_F2( LoadInputMotionVector(uDispatchThreadId) ); + FFX_MIN16_I2 iPrevIdx = FFX_MIN16_I2(fPrevUV * FFX_MIN16_F2(RenderSize()) - 0.5f); + + FFX_MIN16_F3 colorPreAlpha = FFX_MIN16_F3( LoadOpaqueOnly( uDispatchThreadId ) ); + FFX_MIN16_F3 colorPostAlpha = FFX_MIN16_F3( LoadInputColor( uDispatchThreadId ) ); + + FFX_MIN16_F2 outReactiveMask = FFX_MIN16_F2( 0.f, 0.f ); + + outReactiveMask.y = ComputeTransparencyAndComposition(uDispatchThreadId, iPrevIdx); + + if (outReactiveMask.y > 0.5f) + { + outReactiveMask.x = ComputeReactive(uDispatchThreadId, iPrevIdx); + outReactiveMask.x *= FFX_MIN16_F(cbGenerateReactive.fReactiveScale); + outReactiveMask.x = outReactiveMask.x < cbGenerateReactive.fReactiveMax ? 
outReactiveMask.x : FFX_MIN16_F( cbGenerateReactive.fReactiveMax ); + } + + outReactiveMask.y *= FFX_MIN16_F(cbGenerateReactive.fTcScale); + + outReactiveMask.x = ffxMax(outReactiveMask.x, FFX_MIN16_F(LoadReactiveMask(uDispatchThreadId))); + outReactiveMask.y = ffxMax(outReactiveMask.y, FFX_MIN16_F(LoadTransparencyAndCompositionMask(uDispatchThreadId))); + + StoreAutoReactive(uDispatchThreadId, outReactiveMask); + + StorePrevPreAlpha(uDispatchThreadId, colorPreAlpha); + StorePrevPostAlpha(uDispatchThreadId, colorPostAlpha); +} diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_upsample.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_upsample.h new file mode 100644 index 00000000000..abdb8888a9a --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_upsample.h @@ -0,0 +1,194 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
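
Editor's note: before moving on to the upsample shader, here is a minimal host-side sketch of the cbGenerateReactive_t block declared in ffx_fsr2_tcr_autogen_pass.glsl above. Only the fTcThreshold guidance comes from the shader (the "0.1 is a good starting value" comment and the 0.05 fallback used when the constant buffer is not bound); the struct name and the remaining default values are assumptions for illustration, not part of the SDK API.

/* Hypothetical host-side mirror of cbGenerateReactive_t (std140: four
 * consecutive 32-bit floats). Field order must match the shader block. */
typedef struct GenerateReactiveConstants {
    float fTcThreshold;   /* 0.1 is a good starting value; lower marks more TC pixels */
    float fTcScale;       /* scales the transparency-and-composition mask */
    float fReactiveScale; /* scales the auto-generated reactive mask */
    float fReactiveMax;   /* upper clamp applied to the reactive mask */
} GenerateReactiveConstants;

/* Illustrative defaults; 0.05f matches the shader fallback used when the
 * constant buffer is not bound. The other three values are assumptions. */
static const GenerateReactiveConstants kGenerateReactiveDefaults = {
    0.05f, /* fTcThreshold */
    1.0f,  /* fTcScale */
    1.0f,  /* fReactiveScale */
    1.0f,  /* fReactiveMax */
};
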
+ +#ifndef FFX_FSR2_UPSAMPLE_H +#define FFX_FSR2_UPSAMPLE_H + +FFX_STATIC const FfxUInt32 iLanczos2SampleCount = 16; + +void Deringing(RectificationBox clippingBox, FFX_PARAMETER_INOUT FfxFloat32x3 fColor) +{ + fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax); +} +#if FFX_HALF +void Deringing(RectificationBoxMin16 clippingBox, FFX_PARAMETER_INOUT FFX_MIN16_F3 fColor) +{ + fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax); +} +#endif + +#ifndef FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE +#define FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE 2 // Approximate +#endif + +FfxFloat32 GetUpsampleLanczosWeight(FfxFloat32x2 fSrcSampleOffset, FfxFloat32 fKernelWeight) +{ + FfxFloat32x2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight.xx; +#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE + FfxFloat32 fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT + FfxFloat32 fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE + FfxFloat32 fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); +#else +#error "Invalid Lanczos type" +#endif + return fSampleWeight; +} + +#if FFX_HALF +FFX_MIN16_F GetUpsampleLanczosWeight(FFX_MIN16_F2 fSrcSampleOffset, FFX_MIN16_F fKernelWeight) +{ + FFX_MIN16_F2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight.xx; +#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE + FFX_MIN16_F fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT + FFX_MIN16_F fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE + FFX_MIN16_F fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); + + // To Test: Save reciproqual sqrt compute + // FfxFloat32 fSampleWeight = Lanczos2Sq_UseLUT(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); +#else +#error "Invalid Lanczos type" +#endif + return fSampleWeight; +} +#endif + +FfxFloat32 ComputeMaxKernelWeight() { + const FfxFloat32 fKernelSizeBias = 1.0f; + + FfxFloat32 fKernelWeight = FfxFloat32(1) + (FfxFloat32(1.0f) / FfxFloat32x2(DownscaleFactor()) - FfxFloat32(1)).x * FfxFloat32(fKernelSizeBias); + + return ffxMin(FfxFloat32(1.99f), fKernelWeight); +} + +FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params, + FFX_PARAMETER_INOUT RectificationBox clippingBox, FfxFloat32 fReactiveFactor) +{ + #if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF + #include "ffx_fsr2_force16_begin.h" + #endif + // We compute a sliced lanczos filter with 2 lobes (other slices are accumulated temporaly) + FfxFloat32x2 fDstOutputPos = FfxFloat32x2(params.iPxHrPos) + FFX_BROADCAST_FLOAT32X2(0.5f); // Destination resolution output pixel center position + FfxFloat32x2 fSrcOutputPos = fDstOutputPos * DownscaleFactor(); // Source resolution output pixel center position + FfxInt32x2 iSrcInputPos = FfxInt32x2(floor(fSrcOutputPos)); // TODO: what about weird upscale factors... 
+ + #if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF + #include "ffx_fsr2_force16_end.h" + #endif + + FfxFloat32x3 fSamples[iLanczos2SampleCount]; + + FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FfxFloat32x2(0.5f, 0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0 + + FfxInt32x2 offsetTL; + offsetTL.x = (fSrcUnjitteredPos.x > fSrcOutputPos.x) ? FfxInt32(-2) : FfxInt32(-1); + offsetTL.y = (fSrcUnjitteredPos.y > fSrcOutputPos.y) ? FfxInt32(-2) : FfxInt32(-1); + + //Load samples + // If fSrcUnjitteredPos.y > fSrcOutputPos.y, indicates offsetTL.y = -2, sample offset Y will be [-2, 1], clipbox will be rows [1, 3]. + // Flip row# for sampling offset in this case, so first 0~2 rows in the sampled array can always be used for computing the clipbox. + // This reduces branch or cmove on sampled colors, but moving this overhead to sample position / weight calculation time which apply to less values. + const FfxBoolean bFlipRow = fSrcUnjitteredPos.y > fSrcOutputPos.y; + const FfxBoolean bFlipCol = fSrcUnjitteredPos.x > fSrcOutputPos.x; + + FfxFloat32x2 fOffsetTL = FfxFloat32x2(offsetTL); + + FFX_UNROLL + for (FfxInt32 row = 0; row < 3; row++) { + + FFX_UNROLL + for (FfxInt32 col = 0; col < 3; col++) { + FfxInt32 iSampleIndex = col + (row << 2); + + FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row); + FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + offsetTL + sampleColRow; + + const FfxInt32x2 sampleCoord = ClampLoad(iSrcSamplePos, FfxInt32x2(0, 0), FfxInt32x2(RenderSize())); + + fSamples[iSampleIndex] = LoadPreparedInputColor(FfxInt32x2(sampleCoord)); + } + } + + FfxFloat32x4 fColorAndWeight = FfxFloat32x4(0.0f, 0.0f, 0.0f, 0.0f); + + FfxFloat32x2 fBaseSampleOffset = FfxFloat32x2(fSrcUnjitteredPos - fSrcOutputPos); + + // Identify how much of each upsampled color to be used for this frame + const FfxFloat32 fKernelReactiveFactor = ffxMax(fReactiveFactor, FfxFloat32(params.bIsNewSample)); + const FfxFloat32 fKernelBiasMax = ComputeMaxKernelWeight() * (1.0f - fKernelReactiveFactor); + + const FfxFloat32 fKernelBiasMin = ffxMax(1.0f, ((1.0f + fKernelBiasMax) * 0.3f)); + const FfxFloat32 fKernelBiasFactor = ffxMax(0.0f, ffxMax(0.25f * params.fDepthClipFactor, fKernelReactiveFactor)); + const FfxFloat32 fKernelBias = ffxLerp(fKernelBiasMax, fKernelBiasMin, fKernelBiasFactor); + + const FfxFloat32 fRectificationCurveBias = ffxLerp(-2.0f, -3.0f, ffxSaturate(params.fHrVelocity / 50.0f)); + + FFX_UNROLL + for (FfxInt32 row = 0; row < 3; row++) { + FFX_UNROLL + for (FfxInt32 col = 0; col < 3; col++) { + FfxInt32 iSampleIndex = col + (row << 2); + + const FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? 
(3 - row) : row); + const FfxFloat32x2 fOffset = fOffsetTL + FfxFloat32x2(sampleColRow); + FfxFloat32x2 fSrcSampleOffset = fBaseSampleOffset + fOffset; + + FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + FfxInt32x2(offsetTL) + sampleColRow; + + const FfxFloat32 fOnScreenFactor = FfxFloat32(IsOnScreen(FfxInt32x2(iSrcSamplePos), FfxInt32x2(RenderSize()))); + FfxFloat32 fSampleWeight = fOnScreenFactor * FfxFloat32(GetUpsampleLanczosWeight(fSrcSampleOffset, fKernelBias)); + + fColorAndWeight += FfxFloat32x4(fSamples[iSampleIndex] * fSampleWeight, fSampleWeight); + + // Update rectification box + { + const FfxFloat32 fSrcSampleOffsetSq = dot(fSrcSampleOffset, fSrcSampleOffset); + const FfxFloat32 fBoxSampleWeight = exp(fRectificationCurveBias * fSrcSampleOffsetSq); + + const FfxBoolean bInitialSample = (row == 0) && (col == 0); + RectificationBoxAddSample(bInitialSample, clippingBox, fSamples[iSampleIndex], fBoxSampleWeight); + } + } + } + + RectificationBoxComputeVarianceBoxData(clippingBox); + + fColorAndWeight.w *= FfxFloat32(fColorAndWeight.w > FSR2_EPSILON); + + if (fColorAndWeight.w > FSR2_EPSILON) { + // Normalize for deringing (we need to compare colors) + fColorAndWeight.xyz = fColorAndWeight.xyz / fColorAndWeight.w; + fColorAndWeight.w *= fUpsampleLanczosWeightScale; + + Deringing(clippingBox, fColorAndWeight.xyz); + } + + #if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF + #include "ffx_fsr2_force16_end.h" + #endif + + return fColorAndWeight; +} + +#endif //!defined( FFX_FSR2_UPSAMPLE_H ) diff --git a/thirdparty/amd-fsr2/shaders/ffx_spd.h b/thirdparty/amd-fsr2/shaders/ffx_spd.h new file mode 100644 index 00000000000..5ce24ec87cc --- /dev/null +++ b/thirdparty/amd-fsr2/shaders/ffx_spd.h @@ -0,0 +1,936 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
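
Editor's note: as a compact restatement of ComputeUpsampledColorAndWeight() above (the notation is introduced here and is not from the SDK), with per-sample on-screen Lanczos weight $w_i$ and prepared input color $c_i$, the pass effectively computes

\[
c_{\mathrm{up}} = \mathrm{clamp}\left(\frac{\sum_i w_i\, c_i}{\sum_i w_i},\; \mathrm{aabb}_{\min},\; \mathrm{aabb}_{\max}\right),
\qquad
w_{\mathrm{up}} = k \sum_i w_i,
\]

where the clamp is the Deringing() step against the rectification box, $k$ stands for fUpsampleLanczosWeightScale, and the normalization and clamp are skipped (with the weight zeroed) whenever $\sum_i w_i$ does not exceed FSR2_EPSILON.
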
+ +#ifdef FFX_CPU +FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy + FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant + FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant + FfxUInt32x4 rectInfo, // left, top, width, height + FfxInt32 mips) // optional: if -1, calculate based on rect width and height +{ + workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left + workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top + + FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width + FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height + + dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0]; + dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1]; + + numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]); + + if (mips >= 0) + { + numWorkGroupsAndMips[1] = FfxUInt32(mips); + } + else + { + // calculate based on rect width and height + FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]); + numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12)))); + } +} + +FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy + FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant + FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant + FfxUInt32x4 rectInfo) // left, top, width, height +{ + SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1); +} +#endif // #ifdef FFX_CPU + + +//============================================================================================================================== +// NON-PACKED VERSION +//============================================================================================================================== +#ifdef FFX_GPU +#ifdef SPD_PACKED_ONLY +// Avoid compiler error +FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice) +{ + return FfxFloat32x4(0.0, 0.0, 0.0, 0.0); +} + +FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice) +{ + return FfxFloat32x4(0.0, 0.0, 0.0, 0.0); +} +void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice) +{ +} +FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y) +{ + return FfxFloat32x4(0.0, 0.0, 0.0, 0.0); +} +void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value) +{ +} +FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3) +{ + return FfxFloat32x4(0.0, 0.0, 0.0, 0.0); +} +#endif // #ifdef SPD_PACKED_ONLY + +//_____________________________________________________________/\_______________________________________________________________ +#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) +#extension GL_KHR_shader_subgroup_quad:require +#endif + +void SpdWorkgroupShuffleBarrier() +{ +#ifdef FFX_GLSL + barrier(); +#endif +#ifdef FFX_HLSL + GroupMemoryBarrierWithGroupSync(); +#endif +} + +// Only last active workgroup should proceed +bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice) +{ + // global atomic counter + if (localInvocationIndex == 0) + { + SpdIncreaseAtomicCounter(slice); + } + + SpdWorkgroupShuffleBarrier(); + return (SpdGetAtomicCounter() != (numWorkGroups - 1)); +} + +// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 
v3); +FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v) +{ +#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + + FfxFloat32x4 v0 = v; + FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v); + FfxFloat32x4 v2 = subgroupQuadSwapVertical(v); + FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4(v0, v1, v2, v3); + +#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + + // requires SM6.0 + FfxUInt32 quad = WaveGetLaneIndex() & (~0x3); + FfxFloat32x4 v0 = v; + FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1); + FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2); + FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4(v0, v1, v2, v3); +/* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + FfxFloat32x4 v0 = v; + FfxFloat32x4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + FfxFloat32x4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + FfxFloat32x4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4(v0, v1, v2, v3); + */ +#endif + return v; +} + +FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3) +{ + FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y); + FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y); + FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y); + FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y); + return SpdReduce4(v0, v1, v2, v3); +} + +FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice); + FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice); + FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice); + FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice) +{ + return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +} + +FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice); + FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice); + FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice); + FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +FfxFloat32x4 
SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImage(FfxInt32x2(base), slice); +#else + return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +#endif +} + +void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ + FfxFloat32x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + v[0] = SpdReduceQuad(v[0]); + v[1] = SpdReduceQuad(v[1]); + v[2] = SpdReduceQuad(v[2]); + v[3] = SpdReduceQuad(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice); + SpdStoreIntermediate(x / 2, y / 2, v[0]); + + SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice); + SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]); + + SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice); + SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]); + + SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice); + SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ + FfxFloat32x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + for (FfxUInt32 i = 0; i < 4; i++) + { + SpdStoreIntermediate(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 
2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediate(x + 0, y + 0, v[0]); + SpdStoreIntermediate(x + 8, y + 0, v[1]); + SpdStoreIntermediate(x + 0, y + 8, v[2]); + SpdStoreIntermediate(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice); +#else + SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice); +#endif +} + + +void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediate(x * 2 + y % 2, y * 2, v); + } +#else + FfxFloat32x4 v = SpdLoadIntermediate(x, y); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediate(x + (y / 2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + FfxFloat32x4 v = + SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2)); + SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... + SpdStoreIntermediate(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediate(x * 2 + y / 2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... 
+ // 0 x 0 0 0 x 0 0 + FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)); + SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediate(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediate(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... + FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0)); + SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0); + FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0); + FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v0, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 0); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 0); + FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v1, 6, slice); + + tex = FfxInt32x2(x * 4 + 0, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 0, y * 2 + 1); + FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v2, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 1); + FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v3, 6, slice); + + if (mips <= 7) + return; + // no barrier needed, working on values only from the same thread + + FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3); + SpdStore(FfxInt32x2(x, y), v, 7, slice); + SpdStoreIntermediate(x, y, v); +} + +void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice) +{ + if (mips <= baseMip) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice); +} + +void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice) +{ + FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64); + FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 
7)); + SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips <= 6) + return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) + return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. + SpdDownsampleMips_6_7(x, y, mips, slice); + + SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset) +{ + SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//============================================================================================================================== +// PACKED VERSION +//============================================================================================================================== + +#if FFX_HALF + +#ifdef FFX_GLSL +#extension GL_EXT_shader_subgroup_extended_types_float16:require +#endif + +FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v) +{ +#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + FfxFloat16x4 v0 = v; + FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v); + FfxFloat16x4 v2 = subgroupQuadSwapVertical(v); + FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4H(v0, v1, v2, v3); +#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + FfxUInt32 quad = WaveGetLaneIndex() & (~0x3); + FfxFloat16x4 v0 = v; + FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1); + FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2); + FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4H(v0, v1, v2, v3); +/* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + FfxFloat16x4 v0 = v; + FfxFloat16x4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + FfxFloat16x4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + FfxFloat16x4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + 
return SpdReduce4H(v0, v1, v2, v3); + */ +#endif + return FfxFloat16x4(0.0, 0.0, 0.0, 0.0); +} + +FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3) +{ + FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y); + FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y); + FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y); + FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y); + return SpdReduce4H(v0, v1, v2, v3); +} + +FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice); + FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice); + FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice); + FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice) +{ + return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +} + +FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice); + FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice); + FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice); + FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImageH(FfxInt32x2(base), slice); +#else + return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +#endif +} + +void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxFloat16x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + v[0] = SpdReduceQuadH(v[0]); + v[1] = SpdReduceQuadH(v[1]); + v[2] = SpdReduceQuadH(v[2]); + v[3] = SpdReduceQuadH(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice); + SpdStoreIntermediateH(x / 2, y / 2, v[0]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice); + SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, 
slice); + SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice); + SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxFloat16x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + for (FfxInt32 i = 0; i < 4; i++) + { + SpdStoreIntermediateH(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediateH(x + 0, y + 0, v[0]); + SpdStoreIntermediateH(x + 8, y + 0, v[1]); + SpdStoreIntermediateH(x + 0, y + 8, v[2]); + SpdStoreIntermediateH(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice); +#else + SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice); +#endif +} + + +void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... 
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); + } +#else + FfxFloat16x4 v = SpdLoadIntermediateH(x, y); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x + (y / 2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + FfxFloat16x4 v = + SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... + SpdStoreIntermediateH(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediateH(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... 
+ FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0)); + SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0); + FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0); + FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v0, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 0); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 0); + FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v1, 6, slice); + + tex = FfxInt32x2(x * 4 + 0, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 0, y * 2 + 1); + FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v2, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 1); + FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v3, 6, slice); + + if (mips < 8) + return; + // no barrier needed, working on values only from the same thread + + FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3); + SpdStoreH(FfxInt32x2(x, y), v, 7, slice); + SpdStoreIntermediateH(x, y, v); +} + +void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice) +{ + if (mips <= baseMip) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) + return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice); +} + +void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice) +{ + FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64); + FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + + SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips < 7) + return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) + return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. + SpdDownsampleMips_6_7H(x, y, mips, slice); + + SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset) +{ + SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +#endif // #if FFX_HALF +#endif // #ifdef FFX_GPU
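
Editor's note: to make the CPU-side dispatch math of SpdSetup() at the top of ffx_spd.h concrete, the standalone C sketch below reproduces the same calculation for a hypothetical 1920x1080 full-frame rectangle; the input values and the main() harness are illustrative only.

#include <math.h>
#include <stdio.h>

/* Standalone sketch of the SpdSetup() dispatch math above, using plain C
 * types. The 1920x1080 full-frame rectangle is only an example input. */
int main(void)
{
    const unsigned rectInfo[4] = { 0u, 0u, 1920u, 1080u }; /* left, top, width, height */

    /* Each SPD workgroup reduces a 64x64 tile of the source image. */
    const unsigned workGroupOffsetX = rectInfo[0] / 64u;
    const unsigned workGroupOffsetY = rectInfo[1] / 64u;

    const unsigned endIndexX = (rectInfo[0] + rectInfo[2] - 1u) / 64u;
    const unsigned endIndexY = (rectInfo[1] + rectInfo[3] - 1u) / 64u;

    const unsigned dispatchX = endIndexX + 1u - workGroupOffsetX;
    const unsigned dispatchY = endIndexY + 1u - workGroupOffsetY;
    const unsigned numWorkGroups = dispatchX * dispatchY;

    /* Mip count derived from the larger rect dimension, capped at 12. */
    const unsigned resolution = rectInfo[2] > rectInfo[3] ? rectInfo[2] : rectInfo[3];
    const unsigned mips = (unsigned)fmin(floor(log2((double)resolution)), 12.0);

    printf("dispatch %ux%u, workgroups %u, mips %u\n", dispatchX, dispatchY, numWorkGroups, mips);
    return 0;
}

For that input the sketch reports a 30x17 thread-group grid, 510 workgroups, and 10 mip levels, consistent with each SPD workgroup reducing a 64x64 tile of the source.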