From 2e9efa98d7e216797451b6858339bc0286d971c4 Mon Sep 17 00:00:00 2001 From: Erik Hofman Date: Thu, 22 Dec 2016 13:43:04 +0100 Subject: [PATCH] Add accelerated cross product --- simgear/math/SGVec3.hxx | 8 +--- simgear/math/simd.hxx | 92 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 14 deletions(-) diff --git a/simgear/math/SGVec3.hxx b/simgear/math/SGVec3.hxx index 5c936937..e3964157 100644 --- a/simgear/math/SGVec3.hxx +++ b/simgear/math/SGVec3.hxx @@ -330,12 +330,8 @@ normI(SGVec3 v) template inline SGVec3 -cross(const SGVec3& v1, const SGVec3& v2) -{ - return SGVec3(v1(1)*v2(2) - v1(2)*v2(1), - v1(2)*v2(0) - v1(0)*v2(2), - v1(0)*v2(1) - v1(1)*v2(0)); -} +cross(SGVec3 v1, const SGVec3& v2) +{ v1.simd3() = simd4::cross(v1.simd3(), v2.simd3()); return v1; } /// return any normalized vector perpendicular to v template diff --git a/simgear/math/simd.hxx b/simgear/math/simd.hxx index 3dfe5f65..9b419bb8 100644 --- a/simgear/math/simd.hxx +++ b/simgear/math/simd.hxx @@ -89,6 +89,16 @@ inline T dot(simd4_t v1, const simd4_t& v2) { return dp; } +template +inline simd4_t cross(const simd4_t& v1, const simd4_t& v2) +{ + simd4_t d; + d[0] = v1[1]*v2[2] - v1[2]*v2[1]; + d[1] = v1[2]*v2[0] - v1[0]*v2[2]; + d[2] = v1[0]*v2[1] - v1[1]*v2[0]; + return d; +} + } /* namespace simd4 */ @@ -245,26 +255,26 @@ inline simd4_t operator-(const simd4_t& v) { return r; } -template -inline simd4_t operator+(simd4_t v1, const simd4_t& v2) { +template +inline simd4_t operator+(simd4_t v1, const simd4_t& v2) { v1 += v2; return v1; } -template -inline simd4_t operator-(simd4_t v1, const simd4_t& v2) { +template +inline simd4_t operator-(simd4_t v1, const simd4_t& v2) { v1 -= v2; return v1; } -template -inline simd4_t operator*(simd4_t v1, const simd4_t& v2) { +template +inline simd4_t operator*(simd4_t v1, const simd4_t& v2) { v1 *= v2; return v1; } -template -inline simd4_t operator/(simd4_t v1, const simd4_t& v2) { +template +inline simd4_t operator/(simd4_t 
v1, const simd4_t& v2) { v1 /= v2; return v1; } @@ -441,6 +451,26 @@ inline float dot(simd4_t v1, const simd4_t& v2) { return hsum_ps_sse(v1.v4()*v2.v4()); } +template<> +inline simd4_t cross(const simd4_t& v1, const simd4_t& v2) +{ +#if 1 + // http://threadlocalmutex.com/?p=8 + __m128 a = _mm_shuffle_ps(v1.v4(), v1.v4(), _MM_SHUFFLE(3, 0, 2, 1)); + __m128 b = _mm_shuffle_ps(v2.v4(), v2.v4(), _MM_SHUFFLE(3, 0, 2, 1)); + __m128 c = _mm_sub_ps(_mm_mul_ps(v1.v4(), b), _mm_mul_ps(a, v2.v4())); + return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); +#else + v1.v4() = _mm_sub_ps( + _mm_mul_ps(_mm_shuffle_ps(v1.v4(),v1.v4(),_MM_SHUFFLE(3, 0, 2, 1)), + _mm_shuffle_ps(v2.v4(),v2.v4(),_MM_SHUFFLE(3, 1, 0, 2))), + _mm_mul_ps(_mm_shuffle_ps(v1.v4(),v1.v4(),_MM_SHUFFLE(3, 1, 0, 2)), + _mm_shuffle_ps(v2.v4(),v2.v4(),_MM_SHUFFLE(3, 0, 2, 1))) + ); + return v1; +#endif +} + template inline simd4_t min(simd4_t v1, const simd4_t& v2) { v1.v4() = _mm_min_ps(v1.v4(), v2.v4()); @@ -604,6 +634,29 @@ inline double dot(simd4_t v1, const simd4_t& v2) { return hsum_pd_avx(v1.v4()); } +template<> +inline simd4_t cross(const simd4_t& v1, const simd4_t& v2) +{ +#if 1 + // http://threadlocalmutex.com/?p=8 + __m256d a = _mm256_shuffle_pd(v1.v4(), v1.v4(), _MM_SHUFFLE(3, 0, 2, 1)); + __m256d b = _mm256_shuffle_pd(v2.v4(), v2.v4(), _MM_SHUFFLE(3, 0, 2, 1)); + __m256d c = _mm256_sub_pd(_mm256_mul_pd(v1.v4(), b), + _mm256_mul_pd(a, v2.v4())); + return _mm256_shuffle_pd(c, c, _MM_SHUFFLE(3, 0, 2, 1)); +#else + v1.v4() = _mm256_sub_pd( + _mm256_mul_pd( + _mm256_shuffle_pd(v1.v4(),v1.v4(),_MM_SHUFFLE(3, 0, 2, 1)), + _mm256_shuffle_pd(v2.v4(),v2.v4(),_MM_SHUFFLE(3, 1, 0, 2))), + _mm256_mul_pd( + _mm256_shuffle_pd(v1.v4(),v1.v4(),_MM_SHUFFLE(3, 1, 0, 2)), + _mm256_shuffle_pd(v2.v4(),v2.v4(),_MM_SHUFFLE(3, 0, 2, 1))) + ); + return v1; +#endif +} + template inline simd4_t min(simd4_t v1, const simd4_t& v2) { v1.v4() = _mm256_min_pd(v1.v4(), v2.v4()); @@ -775,6 +828,29 @@ inline double dot(simd4_t v1, 
const simd4_t& v2) { return hsum_pd_sse(v1.v4()); } +#if 1 +template<> +inline simd4_t cross(const simd4_t& v1, const simd4_t& v2) +{ + __m128d a[2], b[2], c[2]; + simd4_t r; + + a[0] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(0, 1)); + a[1] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(1, 0)); + + b[0] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(0, 1)); + b[1] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(1, 0)); + + c[0] = _mm_sub_pd(_mm_mul_pd(v1.v4()[0], b[0]), _mm_mul_pd(a[0], v2.v4()[0])); + c[1] = _mm_sub_pd(_mm_mul_pd(v1.v4()[1], b[1]), _mm_mul_pd(a[1], v2.v4()[1])); + + r.v4()[0] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(0, 1)); + r.v4()[1] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(1, 0)); + + return r; +} +#endif + template inline simd4_t min(simd4_t v1, const simd4_t& v2) { v1.v4()[0] = _mm_min_pd(v1.v4()[0], v2.v4()[0]);