From 2db412a923736ec995ef829cb6d62b2df98b8f1e Mon Sep 17 00:00:00 2001 From: Erik Hofman Date: Thu, 19 Jan 2017 16:39:30 +0100 Subject: [PATCH] Specialize class constructors in the hope to ged rid of wrong compiler assignment of an unaligned float array to an SSE register --- simgear/math/simd.hxx | 264 +++++++++++++++++++++++------------------- 1 file changed, 142 insertions(+), 122 deletions(-) diff --git a/simgear/math/simd.hxx b/simgear/math/simd.hxx index 27829cc6..0ec604e8 100644 --- a/simgear/math/simd.hxx +++ b/simgear/math/simd.hxx @@ -18,6 +18,7 @@ #ifndef __SIMD_H__ #define __SIMD_H__ 1 +#include #include #include @@ -309,6 +310,11 @@ inline simd4_t operator*(simd4_t v, T f) { # define ALIGN16C __attribute__((aligned(16))) # define ALIGN32C __attribute__((aligned(32))) # endif + +static const uint32_t m2a32[] = { 0xffffffff,0xffffffff,0,0 }; +static const uint32_t m3a32[] = { 0xffffffff,0xffffffff,0xffffffff,0 }; +static const uint64_t m2a64[] = { 0xffffffffffffffff,0xffffffffffffffff,0,0 }; +static const uint64_t m3a64[] = { 0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0 }; # endif # ifdef __SSE__ @@ -341,25 +347,19 @@ private: union ALIGN16 { __m128 simd4; __vec4f_t vec; - float _v4[4]; } ALIGN16C; public: simd4_t(void) {} - simd4_t(float f) { - simd4 = _mm_set1_ps(f); - for (int i=N; i<4; ++i) _v4[i] = 0.0f; - } + simd4_t(float f) {} simd4_t(float x, float y) : simd4_t(x,y,0,0) {} simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {} simd4_t(float x, float y, float z, float w) { simd4 = _mm_set_ps(w,z,y,x); } - simd4_t(const __vec4f_t v) { - simd4 = _mm_loadu_ps(v); - for (int i=N; i<4; ++i) _v4[i] = 0.0f; - } - simd4_t(const simd4_t& v) { + simd4_t(const __vec4f_t v) {} + template + simd4_t(const simd4_t& v) { simd4 = v.v4(); } simd4_t(const __m128& v) { @@ -387,25 +387,6 @@ public: return vec; } - inline simd4_t& operator=(float f) { - simd4 = _mm_set1_ps(f); - for (int i=N; i<4; ++i) _v4[i] = 0.0f; - return *this; - } - inline simd4_t& operator=(const __vec4f_t v) { - simd4 = _mm_loadu_ps(v); - for (int i=N; i<4; ++i) _v4[i] = 0.0f; - return *this; - } - inline simd4_t& operator=(const simd4_t& v) { - simd4 = v.v4(); - return *this; - } - inline simd4_t& operator=(const __m128& v) { - simd4 = v; - return *this; - } - inline simd4_t& operator+=(float f) { return operator+=(simd4_t(f)); } @@ -443,6 +424,35 @@ public: } }; +static const __m128 fmask2 = _mm_load_ps((const float*)m2a32); +static const __m128 fmask3 = _mm_load_ps((const float*)m3a32); + +template<> +inline simd4_t::simd4_t(const __vec4f_t v) { + simd4 = _mm_loadu_ps(v); +} +template<> +inline simd4_t::simd4_t(const __vec4f_t v) { + simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask3); +} +template<> +inline simd4_t::simd4_t(const __vec4f_t v) { + simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask2); +} +template<> +inline simd4_t::simd4_t(float f) { + simd4 = _mm_set1_ps(f); +} +template<> +inline simd4_t::simd4_t(float f) { + simd4 = _mm_and_ps(_mm_set1_ps(f), fmask3); +} + +template<> +inline simd4_t::simd4_t(float f) { + simd4 = _mm_and_ps(_mm_set1_ps(f), fmask2); +} + namespace simd4 { // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 @@ -548,25 +558,19 @@ private: union ALIGN32 { __m256d simd4; __vec4d_t vec; - double _v4[4]; } ALIGN32C; public: simd4_t(void) {} - simd4_t(double d) { - simd4 = _mm256_set1_pd(d); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - } + simd4_t(double d) {} simd4_t(double x, double y) : simd4_t(x,y,0,0) {} simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {} simd4_t(double x, double y, double z, double w) { simd4 = _mm256_set_pd(w,z,y,x); } - explicit simd4_t(const __vec4d_t v) { - simd4 = _mm256_loadu_pd(v); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - } - simd4_t(const simd4_t& v) { + explicit simd4_t(const __vec4d_t v) {} + template + simd4_t(const simd4_t& v) { simd4 = v.v4(); } simd4_t(const __m256d& v) { @@ -594,25 +598,6 @@ public: return vec; } - inline simd4_t& operator=(double d) { - simd4 = _mm256_set1_pd(d); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - return *this; - } - inline simd4_t& operator=(const __vec4d_t v) { - simd4 = _mm256_loadu_pd(v); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - return *this; - } - inline simd4_t& operator=(const simd4_t& v) { - simd4 = v.v4(); - return *this; - } - inline simd4_t& operator=(const __m256d& v) { - simd4 = v; - return *this; - } - inline simd4_t& operator+=(double d) { return operator+=(simd4_t(d)); } @@ -650,6 +635,34 @@ public: } }; +static const __m256d dmask2 = _mm256_load_pd((const double*)m2a64); +static const __m256d dmask3 = _mm256_load_pd((const double*)m3a64); + +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4 = _mm256_loadu_pd(v); +} +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask3); +} +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask2); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4 = _mm256_set1_pd(d); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask3); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask2); +} + namespace simd4 { // http://berenger.eu/blog/sseavxsimd-horizontal-sum-sum-simd-vector-intrinsic/ @@ -721,27 +734,20 @@ private: union ALIGN16 { __m128d simd4[2]; __vec4d_t vec; - double _v4[4]; } ALIGN16C; public: simd4_t(void) {} - simd4_t(double d) { - simd4[0] = simd4[1] = _mm_set1_pd(d); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - } + simd4_t(double d) {} simd4_t(double x, double y) : simd4_t(x,y,0,0) {} simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {} simd4_t(double x, double y, double z, double w) { simd4[0] = _mm_set_pd(y,x); simd4[1] = _mm_set_pd(w,z); } - explicit simd4_t(const __vec4d_t v) { - simd4[0] = _mm_loadu_pd(v); - simd4[1] = _mm_loadu_pd(v+2); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - } - simd4_t(const simd4_t& v) { + explicit simd4_t(const __vec4d_t v) {} + template + simd4_t(const simd4_t& v) { simd4[0] = v.v4()[0]; simd4[1] = v.v4()[1]; } @@ -771,28 +777,6 @@ public: return vec; } - inline simd4_t& operator=(double d) { - simd4[0] = simd4[1] = _mm_set1_pd(d); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - return *this; - } - inline simd4_t& operator=(const __vec4d_t v) { - simd4[0] = _mm_loadu_pd(v); - simd4[1] = _mm_loadu_pd(v+2); - for (int i=N; i<4; ++i) _v4[i] = 0.0; - return *this; - } - inline simd4_t& operator=(const simd4_t& v) { - simd4[0] = v.v4()[0]; - simd4[1] = v.v4()[1]; - return *this; - } - inline simd4_t& operator=(const __m128d v[2]) { - simd4[0] = v[0]; - simd4[1] = v[1]; - return *this; - } - inline simd4_t& operator+=(double d) { return operator+=(simd4_t(d)); } @@ -834,6 +818,38 @@ public: } }; +static const __m128d dmask3 = _mm_load_pd((const double*)(m3a64+2)); + +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4[0] = _mm_loadu_pd(v); + simd4[1] = _mm_loadu_pd(v+2); +} +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4[0] = _mm_loadu_pd(v); + simd4[1] = _mm_and_pd(_mm_loadu_pd(v+2), dmask3); +} +template<> +inline simd4_t::simd4_t(const __vec4d_t v) { + simd4[0] = _mm_loadu_pd(v); + simd4[1] = _mm_setzero_pd(); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4[0] = simd4[1] = _mm_set1_pd(d); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4[0] = _mm_set1_pd(d); + simd4[1] = _mm_and_pd(simd4[0], dmask3); +} +template<> +inline simd4_t::simd4_t(double d) { + simd4[0] = _mm_set1_pd(d); + simd4[1] = _mm_setzero_pd(); +} + namespace simd4 { // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 @@ -924,25 +940,19 @@ private: union ALIGN16 { __m128i simd4; __vec4i_t vec; - int _v4[4]; } ALIGN16C; public: simd4_t(void) {} - simd4_t(int i) { - simd4 = _mm_set1_epi32(i); - for (int i=N; i<4; ++i) _v4[i] = 0; - } + simd4_t(int i) {} simd4_t(int x, int y) : simd4_t(x,y,0,0) {} simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {} simd4_t(int x, int y, int z, int w) { simd4 = _mm_set_epi32(w,z,y,x); } - explicit simd4_t(const __vec4i_t v) { - simd4 = _mm_loadu_si128((__m128i*)v); - for (int i=N; i<4; ++i) _v4[i] = 0; - } - simd4_t(const simd4_t& v) { + simd4_t(const __vec4i_t v) {} + template + simd4_t(const simd4_t& v) { simd4 = v.v4(); } simd4_t(const __m128i& v) { @@ -973,25 +983,6 @@ public: return vec; } - inline simd4_t& operator=(int i) { - simd4 = _mm_set1_epi32(i); - for (int i=N; i<4; ++i) _v4[i] = 0; - return *this; - } - inline simd4_t& operator=(const __vec4i_t v) { - simd4 = _mm_loadu_si128((__m128i*)v); - for (int i=N; i<4; ++i) _v4[i] = 0; - return *this; - } - inline simd4_t& operator=(const simd4_t& v) { - simd4 = v.v4(); - return *this; - } - inline simd4_t& operator=(const __m128i& v) { - simd4 = v; - return *this; - } - inline simd4_t& operator+=(int i) { return operator+=(simd4_t(i)); } @@ -1037,12 +1028,41 @@ public: template inline simd4_t& operator/=(const simd4_t& v) { for (int i=0; i +inline simd4_t::simd4_t(const __vec4i_t v) { + simd4 = _mm_loadu_si128((__m128i*)v); +} +template<> +inline simd4_t::simd4_t(const __vec4i_t v) { + simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask3); +} +template<> +inline simd4_t::simd4_t(const __vec4i_t v) { + simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask2); +} +template<> +inline simd4_t::simd4_t(int i) { + simd4 = _mm_set1_epi32(i); +} +template<> +inline simd4_t::simd4_t(int i) { + simd4 = _mm_and_si128(_mm_set1_epi32(i), imask3); +} +template<> +inline simd4_t::simd4_t(int i) { + simd4 = _mm_and_si128(_mm_set1_epi32(i), imask2); +} + + namespace simd4 { @@ -1065,7 +1085,7 @@ inline simd4_t max(simd4_t v1, const simd4_t& v2) { # endif # ifdef __ARM_NEON__ -# include +# include "simd_neon.hxx" # endif #endif /* __SIMD_H__ */