diff --git a/simgear/math/simd.hxx b/simgear/math/simd.hxx index 9c0bbc01..27829cc6 100644 --- a/simgear/math/simd.hxx +++ b/simgear/math/simd.hxx @@ -355,7 +355,7 @@ public: simd4_t(float x, float y, float z, float w) { simd4 = _mm_set_ps(w,z,y,x); } - explicit simd4_t(const __vec4f_t v) { + simd4_t(const __vec4f_t v) { simd4 = _mm_loadu_ps(v); for (int i=N; i<4; ++i) _v4[i] = 0.0f; } @@ -1064,5 +1064,9 @@ inline simd4_t max(simd4_t v1, const simd4_t& v2) { # endif +# ifdef __ARM_NEON__ +# include +# endif + #endif /* __SIMD_H__ */ diff --git a/simgear/math/simd4x4.hxx b/simgear/math/simd4x4.hxx index 0f355fe3..104c1ff2 100644 --- a/simgear/math/simd4x4.hxx +++ b/simgear/math/simd4x4.hxx @@ -1269,5 +1269,10 @@ inline simd4_t transform(const simd4x4_t& m, const simd4_t +# endif + + #endif /* __SIMD4X4_H__ */ diff --git a/simgear/math/simd4x4_neon.hxx b/simgear/math/simd4x4_neon.hxx new file mode 100644 index 00000000..983eac39 --- /dev/null +++ b/simgear/math/simd4x4_neon.hxx @@ -0,0 +1,706 @@ +// Copyright (C) 2016 Erik Hofman - erik@ehofman.com +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Library General Public +// License as published by the Free Software Foundation; either +// version 2 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +// + +#ifndef __SIMD4X4_NEON_H__ +#define __SIMD4X4_NEON_H__ 1 + +#include "simd_neon.hxx" + +#ifdef __ARM_NEON__ +template<> +class simd4x4_t +{ +private: + typedef float __mtx4f_t[4][4]; + + union ALIGN16 { + float32x4_t simd4x4[4]; + __mtx4f_t mtx; + float array[4*4]; + } ALIGN16C; + +public: + simd4x4_t(void) {} + simd4x4_t(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33) + { + array[0] = m00; array[1] = m10; + array[2] = m20; array[3] = m30; + array[4] = m01; array[5] = m11; + array[6] = m21; array[7] = m31; + array[8] = m02; array[9] = m12; + array[10] = m22; array[11] = m32; + array[12] = m03; array[13] = m13; + array[14] = m23; array[15] = m33; + } + simd4x4_t(const float m[4*4]) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t((const float*)&m[4*i]).v4(); + } + } + + explicit simd4x4_t(const __mtx4f_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + } + simd4x4_t(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + } + ~simd4x4_t(void) {} + + inline float32x4_t (&m4x4(void))[4] { + return simd4x4; + } + + inline const float32x4_t (&m4x4(void) const)[4] { + return simd4x4; + } + + inline const float (&ptr(void) const)[4][4] { + return mtx; + } + + inline float (&ptr(void))[4][4] { + return mtx; + } + + inline operator const float*(void) const { + return array; + } + + inline operator float*(void) { + return array; + } + + template + inline void set(int i, const simd4_t& v) { + simd4x4[i] = v.v4(); + } + + inline simd4x4_t& operator=(const __mtx4f_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + return *this; + } + inline simd4x4_t& operator=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator+=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = vaddq_f32(simd4x4[i], m.m4x4()[i]); + } + return *this; + } + + inline simd4x4_t& operator-=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = vsubq_f32(simd4x4[i], m.m4x4()[i]); + } + return *this; + } + + inline simd4x4_t& operator*=(float f) { + simd4_t f4(f); + for (int i=0; i<4; ++i) { + simd4x4[i] = vmulq_f32(simd4x4[i], f4.v4()); + } + return *this; + } + + simd4x4_t& operator*=(const simd4x4_t& m2) { + simd4x4_t m1 = *this; + float32x4_t row, col; + for (int i=0; i<4; ++i) { + col = vdupq_n_f32(m2.ptr()[i][0]); + row = vmulq_f32(m1.m4x4()[0], col); + for (int j=1; j<4; ++j) { + col = vdupq_n_f32(m2.ptr()[i][j]); + row = vaddq_f32(row, vmulq_f32(m1.m4x4()[j], col)); + } + simd4x4[i] = row; + } + return *this; + } +}; + +template +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + float32x4_t mv = vmulq_f32(m.m4x4()[0], vdupq_n_f32(vi.ptr()[0])); + for (int i=1; i +inline simd4x4_t rotation_matrix(float angle, const simd4_t& axis) +{ + float s = std::sin(angle), c = std::cos(angle), t = 1.0-c; + simd4_t axt, at = axis*t, as = axis*s; + simd4x4_t m; + + simd4x4::unit(m); + axt = axis.ptr()[0]*at; + m.m4x4()[0] = axt.v4(); + + axt = axis.ptr()[1]*at; + m.ptr()[0][0] += c; + m.ptr()[0][1] += as.ptr()[2]; + m.ptr()[0][2] -= as.ptr()[1]; + + m.m4x4()[1] = axt.v4(); + + axt = axis.ptr()[2]*at; + m.ptr()[1][0] -= as.ptr()[2]; + m.ptr()[1][1] += c; + m.ptr()[1][2] += as.ptr()[0]; + + m.m4x4()[2] = axt.v4(); + + m.ptr()[2][0] += as.ptr()[1]; + m.ptr()[2][1] -= as.ptr()[0]; + m.ptr()[2][2] += c; + + return m; +} + +template<> +inline simd4x4_t transpose(simd4x4_t m) { +// http://clb.demon.fi/MathGeoLib/nightly/docs/float4x4_neon.h_code.html + float32x4x4_t x = vld4q_f32(m); + vst1q_f32(&m[0], x.val[0]); + vst1q_f32(&m[4], x.val[1]); + vst1q_f32(&m[8], x.val[2]); + vst1q_f32(&m[12], x.val[3]); + return m; +} + +inline void translate(simd4x4_t& m, const simd4_t& dist) { + m.m4x4()[3] = vsubq_f32(m.m4x4()[3], dist.v4()); +} + +template +inline void pre_translate(simd4x4_t& m, const simd4_t& dist) +{ + simd4x4_t mt = simd4x4::transpose(m); + float32x4_t row3 = mt.m4x4()[3]; + for (int i=0; i<3; ++i) { + float32x4_t t = vdupq_n_f32(float(dist[i])); + mt.m4x4()[i] = vaddq_f32(mt.m4x4()[i], vmulq_f32(t, row3)); + } + m = simd4x4::transpose(mt); +} + +template +inline void post_translate(simd4x4_t& m, const simd4_t& dist) +{ + float32x4_t col3 = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + float32x4_t t = vdupq_n_f32(float(dist[i])); + col3 = vaddq_f32(col3, vmulq_f32(t, m.m4x4()[i])); + } + m.m4x4()[3] = col3; +} + +template<> +inline simd4_t transform(const simd4x4_t& m, const simd4_t& pt) { + float32x4_t tpt = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + float32x4_t ptd = vdupq_n_f32(pt[i]); + tpt = vaddq_f32(tpt, vmulq_f32(ptd, m.m4x4()[i])); + } + vsetq_lane_f32(0.0f, tpt, 3); + return tpt; +} + +} /* namespace simd4x */ + + + +#if 0 + +template<> +class simd4x4_t +{ +private: + typedef double __mtx4d_t[4][4]; + + union ALIGN32 { + __m256d simd4x4[4]; + __mtx4d_t mtx; + double array[4*4]; + } ALIGN32C; + +public: + simd4x4_t(void) {} + simd4x4_t(double m00, double m01, double m02, double m03, + double m10, double m11, double m12, double m13, + double m20, double m21, double m22, double m23, + double m30, double m31, double m32, double m33) + { + simd4x4[0] = _mm256_set_pd(m30,m20,m10,m00); + simd4x4[1] = _mm256_set_pd(m31,m21,m11,m01); + simd4x4[2] = _mm256_set_pd(m32,m22,m12,m02); + simd4x4[3] = _mm256_set_pd(m33,m23,m13,m03); + } + explicit simd4x4_t(const double m[4*4]) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t((const double*)&m[4*i]).v4(); + } + } + + explicit simd4x4_t(const __mtx4d_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + } + simd4x4_t(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + } + ~simd4x4_t(void) {} + + inline __m256d (&m4x4(void))[4] { + return simd4x4; + } + + inline const __m256d (&m4x4(void) const)[4] { + return simd4x4; + } + + inline const double (&ptr(void) const)[4][4] { + return mtx; + } + + inline double (&ptr(void))[4][4] { + return mtx; + } + + inline operator const double*(void) const { + return array; + } + + inline operator double*(void) { + return array; + } + + template + inline void set(int i, const simd4_t& v) { + simd4x4[i] = v.v4(); + } + + inline simd4x4_t& operator=(const __mtx4d_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + return *this; + } + inline simd4x4_t& operator=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator+=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = _mm256_add_pd(simd4x4[i], m.m4x4()[i]); + } + return *this; + } + + inline simd4x4_t& operator-=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = _mm256_sub_pd(simd4x4[i], m.m4x4()[i]); + } + return *this; + } + + inline simd4x4_t& operator*=(double f) { + simd4_t f4(f); + for (int i=0; i<4; ++i) { + simd4x4[i] = _mm256_mul_pd(simd4x4[i], f4.v4()); + } + return *this; + } + + simd4x4_t& operator*=(const simd4x4_t& m2) { + simd4x4_t m1 = *this; + __m256d row, col; + for (int i=0; i<4; ++i ) { + col = _mm256_set1_pd(m2.ptr()[i][0]); + row = _mm256_mul_pd(m1.m4x4()[0], col); + for (int j=1; j<4; ++j) { + col = _mm256_set1_pd(m2.ptr()[i][j]); + row = _mm256_add_pd(row, _mm256_mul_pd(m1.m4x4()[j], col)); + } + simd4x4[i] = row; + } + return *this; + } + +}; + +template +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + __m256d mv = _mm256_mul_pd(m.m4x4()[0], _mm256_set1_pd(vi.ptr()[0])); + for (int i=1; i +inline simd4x4_t rotation_matrix(double angle, const simd4_t& axis) +{ + double s = std::sin(angle), c = std::cos(angle), t = 1.0-c; + simd4_t axt, at = axis*t, as = axis*s; + simd4x4_t m; + + simd4x4::unit(m); + axt = axis.ptr()[0]*at; + m.m4x4()[0] = axt.v4(); + + axt = axis.ptr()[1]*at; + m.ptr()[0][0] += c; + m.ptr()[0][1] += as.ptr()[2]; + m.ptr()[0][2] -= as.ptr()[1]; + + m.m4x4()[1] = axt.v4(); + + axt = axis.ptr()[2]*at; + m.ptr()[1][0] -= as.ptr()[2]; + m.ptr()[1][1] += c; + m.ptr()[1][2] += as.ptr()[0]; + + m.m4x4()[2] = axt.v4(); + + m.ptr()[2][0] += as.ptr()[1]; + m.ptr()[2][1] -= as.ptr()[0]; + m.ptr()[2][2] += c; + + return m; +} + +template<> +inline simd4x4_t transpose(simd4x4_t m) { +// http://stackoverflow.com/questions/36167517/m256d-transpose4-equivalent + __m256d tmp0 = _mm256_shuffle_pd(m.m4x4()[0], m.m4x4()[1], 0x0); + __m256d tmp2 = _mm256_shuffle_pd(m.m4x4()[0], m.m4x4()[1], 0xF); + __m256d tmp1 = _mm256_shuffle_pd(m.m4x4()[2], m.m4x4()[3], 0x0); + __m256d tmp3 = _mm256_shuffle_pd(m.m4x4()[2], m.m4x4()[3], 0xF); + + m.m4x4()[0] = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); + m.m4x4()[1] = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); + m.m4x4()[2] = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); + m.m4x4()[3] = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); + + return m; +} + +inline void translate(simd4x4_t& m, const simd4_t& dist) { + m.m4x4()[3] = _mm256_sub_pd(m.m4x4()[3], dist.v4()); +} + +template +inline void pre_translate(simd4x4_t& m, const simd4_t& dist) +{ + simd4_t row3(m.ptr()[0][3],m.ptr()[1][3],m.ptr()[2][3],m.ptr()[3][3]); + for (int i=0; i<3; ++i) { + for (int j=0; j<4; ++j) { + m.ptr()[j][i] += row3[j]*double(dist[i]); + } + } +#if 0 + // this is slower + simd4x4_t mt = simd4x4::transpose(m); + __mm256d row3 = mt.m4x4()[3]; + for (int i=0; i<3; ++i) { + __mm256d _mm256_set1_pd(float(dist[i])); + mt.m4x4()[i] = _mm256_add_pd(mt.m4x4()[i], _mm256_mul_pd(t, row3)); + } + m = simd4x4::transpose(mt); +#endif +} + +template +inline void post_translate(simd4x4_t& m, const simd4_t& dist) { + __m256d col3 = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + __m256d t = _mm256_set1_pd(double(dist[i])); + col3 = _mm256_add_pd(col3, _mm256_mul_pd(t, m.m4x4()[i])); + } + m.m4x4()[3] = col3; +} + +template<> +inline simd4_t transform(const simd4x4_t& m, const simd4_t& pt) { + simd4_t res; + __m256d tpt = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + __m256d ptd = _mm256_set1_pd(pt[i]); + tpt = _mm256_add_pd(tpt, _mm256_mul_pd(ptd, m.m4x4()[i])); + } + res = tpt; + res[3] = 0.0; + return res; +} + +} /* namespace simd4x4 */ +# endif + + +template<> +class simd4x4_t +{ +private: + typedef int __mtx4i_t[4][4]; + + union ALIGN16 { + int32x4_t simd4x4[4]; + __mtx4i_t mtx; + int array[4*4]; + } ALIGN16C; + +public: + simd4x4_t(void) {} + simd4x4_t(int m00, int m01, int m02, int m03, + int m10, int m11, int m12, int m13, + int m20, int m21, int m22, int m23, + int m30, int m31, int m32, int m33) + { + array[0] = m00; array[1] = m10; + array[2] = m20; array[3] = m30; + array[4] = m01; array[5] = m11; + array[6] = m21; array[7] = m31; + array[8] = m02; array[9] = m12; + array[10] = m22; array[11] = m32; + array[12] = m03; array[13] = m13; + array[14] = m23; array[15] = m33; + } + simd4x4_t(const int m[4*4]) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t((const int*)&m[4*i]).v4(); + } + } + explicit simd4x4_t(const __mtx4i_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + } + simd4x4_t(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + } + ~simd4x4_t(void) {} + + inline int32x4_t (&m4x4(void))[4] { + return simd4x4; + } + + inline const int32x4_t (&m4x4(void) const)[4] { + return simd4x4; + } + + inline const int (&ptr(void) const)[4][4] { + return mtx; + } + + inline int (&ptr(void))[4][4] { + return mtx; + } + + inline operator const int*(void) const { + return array; + } + + inline operator int*(void) { + return array; + } + + template + inline void set(int i, const simd4_t& v) { + simd4x4[i] = v.v4(); + } + + inline simd4x4_t& operator=(const __mtx4i_t m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + return *this; + } + inline simd4x4_t& operator=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] = m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator+=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] += m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator-=(const simd4x4_t& m) { + for (int i=0; i<4; ++i) { + simd4x4[i] -= m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator*=(int f) { + simd4_t f4(f); + for (int i=0; i<4; ++i) { + simd4x4[i] *= f4.v4(); + } + return *this; + } + + simd4x4_t& operator*=(const simd4x4_t& m2) { + simd4x4_t m1 = *this; + simd4_t row, col; + + for (int i=0; i<4; ++i) { + simd4_t col(m2.ptr()[i][0]); + row.v4() = m1.m4x4()[0] * col.v4(); + for (int j=1; j<4; ++j) { + simd4_t col(m2.ptr()[i][j]); + row.v4() += m1.m4x4()[j] * col.v4(); + } + simd4x4[i] = row.v4(); + } + return *this; + } +}; + +template +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + simd4_t mv(m.m4x4()[0]); + mv *= vi.ptr()[0]; + for (int i=1; i row(m.m4x4()[i]); + row *= vi.ptr()[i]; + mv.v4() += row.v4(); + } + for (int i=M; i<4; ++i) mv[i] = 0; + return mv; +} + +template<> +inline simd4x4_t operator*(const simd4x4_t& m1, const simd4x4_t& m2) +{ + simd4_t row, col; + simd4x4_t m; + + for (int i=0; i<4; ++i) { + simd4_t col(m2.ptr()[i][0]); + row.v4() = m1.m4x4()[0] * col.v4(); + for (int j=1; j<4; ++j) { + simd4_t col(m2.ptr()[i][j]); + row.v4() += m1.m4x4()[j] * col.v4(); + } + m.m4x4()[i] = row.v4(); + } + + return m; +} + +namespace simd4x4 +{ + +template<> +inline simd4x4_t transpose(simd4x4_t m) { + int32x4x4_t x = vld4q_s32(m); + vst1q_s32(&m[0], x.val[0]); + vst1q_s32(&m[4], x.val[1]); + vst1q_s32(&m[8], x.val[2]); + vst1q_s32(&m[12], x.val[3]); + return m; +} + +inline void translate(simd4x4_t& m, const simd4_t& dist) { + m.m4x4()[3] = vsubq_s32(m.m4x4()[3], dist.v4()); +} + +template +inline void pre_translate(simd4x4_t& m, const simd4_t& dist) +{ + simd4x4_t mt = simd4x4::transpose(m); + simd4_t row3(mt.ptr()[3]); + for (int i=0; i<3; ++i) { + simd4_t trow3 = int(dist[i]); + trow3 *= row3.v4(); + mt.m4x4()[i] = vaddq_s32(mt.m4x4()[i], trow3.v4()); + } + m = simd4x4::transpose(mt); +} + +template +inline void post_translate(simd4x4_t& m, const simd4_t& dist) +{ + int32x4_t col3 = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + simd4_t trow3 = int(dist[i]); + trow3 *= m.m4x4()[i]; + col3 = vaddq_s32(col3, trow3.v4()); + } + m.m4x4()[3] = col3; +} + +template<> +inline simd4_t transform(const simd4x4_t& m, const simd4_t& pt) { + simd4_t tpt = m.m4x4()[3]; + for (int i=0; i<3; ++i) { + simd4_t ptd = m.m4x4()[i]; + ptd *= pt[i]; + tpt.v4() = vaddq_s32(tpt.v4(), ptd.v4()); + } + tpt[3] = 0.0; + return tpt; +} + + +} /* namespace simd4x */ +#endif + +#endif /* __SIMD4X4_NEON_H__ */ + diff --git a/simgear/math/simd_neon.hxx b/simgear/math/simd_neon.hxx index 1ca5f0a3..11a2614b 100644 --- a/simgear/math/simd_neon.hxx +++ b/simgear/math/simd_neon.hxx @@ -40,23 +40,22 @@ class simd4_t private: typedef float __vec4f_t[N]; - union { + union ALIGN16 { float32x4_t simd4; __vec4f_t vec; float _v4[4]; - }; + } ALIGN16C; public: simd4_t(void) {} simd4_t(float f) { - simd4 = vdupq_n_f3(f); + simd4 = vdupq_n_f32(f); for (int i=N; i<4; ++i) _v4[i] = 0.0f; } simd4_t(float x, float y) : simd4_t(x,y,0,0) {} simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {} simd4_t(float x, float y, float z, float w) { - ALIGN16 float ALIGN16C data[4] = { x, y, z, w }; - simd4 = vld1q_f32(data); + _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w; } simd4_t(const __vec4f_t v) { simd4 = vld1q_f32(v); @@ -91,7 +90,7 @@ public: } inline simd4_t& operator=(float f) { - simd4 = vdupq_n_f3(f); + simd4 = vdupq_n_f32(f); for (int i=N; i<4; ++i) _v4[i] = 0.0f; return *this; } @@ -142,7 +141,7 @@ public: template inline simd4_t& operator/=(const simd4_t& v) { // http://stackoverflow.com/questions/6759897/how-to-divide-in-neon-intrinsics-by-a-float-number - float32x2_t recip = vrecpeq_f32(v.v4()); + float32x4_t recip = vrecpeq_f32(v.v4()); recip = vmulq_f32(vrecpsq_f32(v.v4(), recip), recip); recip = vmulq_f32(vrecpsq_f32(v.v4(), recip), recip); simd4 = vmulq_f32(simd4, recip); @@ -168,12 +167,12 @@ inline float dot(simd4_t v1, const simd4_t& v2) { return hsum_float32x4_neon(v1.v4()*v2.v4()); } -// https://github.com/scoopr/vectorial/blob/master/include/vectorial/float32x4_t_neon.h +// https://github.com/scoopr/vectorial/blob/master/include/vectorial/simd4f_neon.h template<> inline simd4_t cross(const simd4_t& v1, const simd4_t& v2) { - static const uint32_t mask[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; - static const int32x4_t mask = vld1q_s32((const int32_t*)mask); + static const uint32_t mask_a[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; + static const int32x4_t mask = vld1q_s32((const int32_t*)mask_a); // Compute v1 and v2 in order yzx float32x2_t v1_low = vget_low_f32(v1.v4()); @@ -181,13 +180,14 @@ inline simd4_t cross(const simd4_t& v1, const simd4_t float32x4_t v1_yzx = vcombine_f32(vext_f32(v1_low, vget_high_f32(v1.v4()),1), v1_low); float32x4_t v2_yzx = vcombine_f32(vext_f32(v2_low, vget_high_f32(v2.v4()),1), v2_low); // Compute cross in order zxy - float32x4_t s3 = float32x4_t_sub(float32x4_t_mul(v2_yzx, v1.v4()), float32x4_t_mul(v1_yzx, v2.v4())); + float32x4_t s3 = vsubq_f32(vmulq_f32(v2_yzx, v1.v4()), vmulq_f32(v1_yzx, v2.v4())); // Permute cross to order xyz and zero out the fourth value float32x2_t low = vget_low_f32(s3); s3 = vcombine_f32(vext_f32(low, vget_high_f32(s3), 1), low); return (float32x4_t)vandq_s32((int32x4_t)s3,mask); } + template inline simd4_t min(simd4_t v1, const simd4_t& v2) { v1 = vminq_f32(v1.v4(), v2.v4()); @@ -386,11 +386,11 @@ class simd4_t private: typedef int __vec4i_t[N]; - union { + union ALIGN16 { int32x4_t simd4; __vec4i_t vec; int _v4[4]; - }; + } ALIGN16C; public: simd4_t(void) {} @@ -401,11 +401,10 @@ public: simd4_t(int x, int y) : simd4_t(x,y,0,0) {} simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {} simd4_t(int x, int y, int z, int w) { - ALIGN16 int32_t ALIGN16C data[4] = { x, y, z, w }; - simd4 = vld1q_s32(data); + _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w; } simd4_t(const __vec4i_t v) { - simd4 = vld1q_s32((int32x4_t*)v); + simd4 = vld1q_s32(v); for (int i=N; i<4; ++i) _v4[i] = 0; } simd4_t(const simd4_t& v) { @@ -445,7 +444,7 @@ public: return *this; } inline simd4_t& operator=(const __vec4i_t v) { - simd4 = vld1q_s32((int32x4_t*)v); + simd4 = vld1q_s32(v); for (int i=N; i<4; ++i) _v4[i] = 0; return *this; } @@ -484,7 +483,7 @@ public: return operator*=(v.v4()); } inline simd4_t& operator*=(const int32x4_t& v) { - simd4 = vmulq_s32(simd4, v.v4()); + simd4 = vmulq_s32(simd4, v); return *this; }