51 lines
1.3 KiB
C++
51 lines
1.3 KiB
C++
// Copyright 2009-2020 Intel Corporation
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
#pragma once
|
|
|
|
/* Make precision match SSE, at the cost of some performance */
|
|
#if !defined(__aarch64__)
|
|
# define SSE2NEON_PRECISE_DIV 1
|
|
# define SSE2NEON_PRECISE_SQRT 1
|
|
#endif
|
|
|
|
#include "sse2neon.h"
|
|
|
|
__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
|
|
__m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
|
|
return _mm_fmadd_ps(a, b, neg_c);
|
|
}
|
|
|
|
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
|
|
#if defined(__aarch64__)
|
|
return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
|
|
vreinterpretq_f32_m128(b),
|
|
vreinterpretq_f32_m128(a)));
|
|
#else
|
|
return _mm_sub_ps(c, _mm_mul_ps(a, b));
|
|
#endif
|
|
}
|
|
|
|
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
|
|
return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
|
|
}
|
|
|
|
|
|
/* Dummy defines for floating point control */
|
|
#define _MM_MASK_MASK 0x1f80
|
|
#define _MM_MASK_DIV_ZERO 0x200
|
|
#define _MM_FLUSH_ZERO_ON 0x8000
|
|
#define _MM_MASK_DENORM 0x100
|
|
#define _MM_SET_EXCEPTION_MASK(x)
|
|
#define _MM_SET_FLUSH_ZERO_MODE(x)
|
|
|
|
__forceinline int _mm_getcsr()
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
__forceinline void _mm_mfence()
|
|
{
|
|
__sync_synchronize();
|
|
}
|