Add an ARM NEON readu simd4x4_neon.hxx and small fixes for simd_neon.hxx

2017-01-19 13:20:52 +01:00
parent c4ea62a899
commit 061fea48c8
4 changed files with 733 additions and 19 deletions
--- a/simgear/math/simd.hxx
+++ b/simgear/math/simd.hxx
@@ -355,7 +355,7 @@ public:
    simd4_t(float x, float y, float z, float w) {
        simd4 = _mm_set_ps(w,z,y,x);
    }
-    explicit simd4_t(const __vec4f_t v) {
+    simd4_t(const __vec4f_t v) {
        simd4 = _mm_loadu_ps(v);
        for (int i=N; i<4; ++i) _v4[i] = 0.0f;
    }
@@ -1064,5 +1064,9 @@ inline simd4_t<int,N> max(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {

 # endif

+# ifdef __ARM_NEON__
+#  include <simgear/math/simd_neon.hxx>
+# endif
+
 #endif /* __SIMD_H__ */

--- a/simgear/math/simd4x4.hxx
+++ b/simgear/math/simd4x4.hxx
@@ -1269,5 +1269,10 @@ inline simd4_t<int,3> transform<int>(const simd4x4_t<int,4>& m, const simd4_t<in
 } /* namespace simd4x */
 # endif

+# ifdef __ARM_NEON__
+#  include <simgear/math/simd4x4_neon.hxx>
+# endif
+
+
 #endif /* __SIMD4X4_H__ */

--- a/simgear/math/simd4x4_neon.hxx
+++ b/simgear/math/simd4x4_neon.hxx
@@ -0,0 +1,706 @@
+// Copyright (C) 2016 Erik Hofman - erik@ehofman.com
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Library General Public
+// License as published by the Free Software Foundation; either
+// version 2 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Library General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//
+
+#ifndef __SIMD4X4_NEON_H__
+#define __SIMD4X4_NEON_H__	1
+
+#include "simd_neon.hxx"
+
+#ifdef __ARM_NEON__
+template<>
+class simd4x4_t<float,4>
+{
+private:
+    typedef float  __mtx4f_t[4][4];
+
+    union ALIGN16 {
+        float32x4_t simd4x4[4];
+        __mtx4f_t mtx;
+        float array[4*4];
+    } ALIGN16C;
+
+public:
+    simd4x4_t(void) {}
+    simd4x4_t(float m00, float m01, float m02, float m03,
+              float m10, float m11, float m12, float m13,
+              float m20, float m21, float m22, float m23,
+              float m30, float m31, float m32, float m33)
+    {
+        array[0] = m00; array[1] = m10;
+        array[2] = m20; array[3] = m30;
+        array[4] = m01; array[5] = m11;
+        array[6] = m21; array[7] = m31;
+        array[8] = m02; array[9] = m12;
+        array[10] = m22; array[11] = m32;
+        array[12] = m03; array[13] = m13;
+        array[14] = m23; array[15] = m33;
+    }
+    simd4x4_t(const float m[4*4]) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<float,4>((const float*)&m[4*i]).v4();
+        }
+    }
+
+    explicit simd4x4_t(const __mtx4f_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<float,4>(m[i]).v4();
+        }
+    }
+    simd4x4_t(const simd4x4_t<float,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+    }
+    ~simd4x4_t(void) {}
+
+    inline float32x4_t (&m4x4(void))[4] {
+        return simd4x4;
+    }
+
+    inline const float32x4_t (&m4x4(void) const)[4] {
+        return simd4x4;
+    }
+    
+    inline const float (&ptr(void) const)[4][4] {
+        return mtx;
+    }
+
+    inline float (&ptr(void))[4][4] {
+        return mtx;
+    }
+
+    inline operator const float*(void) const {
+        return array;
+    }
+
+    inline operator float*(void) {
+        return array;
+    }
+
+    template<int M>
+    inline void set(int i, const simd4_t<float,M>& v) {
+        simd4x4[i] = v.v4();
+    }
+
+    inline simd4x4_t<float,4>& operator=(const __mtx4f_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<float,4>(m[i]).v4();
+        }
+        return *this;
+    }
+    inline simd4x4_t<float,4>& operator=(const simd4x4_t<float,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<float,4>& operator+=(const simd4x4_t<float,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = vaddq_f32(simd4x4[i], m.m4x4()[i]);
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<float,4>& operator-=(const simd4x4_t<float,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = vsubq_f32(simd4x4[i], m.m4x4()[i]);
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<float,4>& operator*=(float f) {
+        simd4_t<float,4> f4(f);
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = vmulq_f32(simd4x4[i], f4.v4());
+        }
+        return *this;
+    }
+
+    simd4x4_t<float,4>& operator*=(const simd4x4_t<float,4>& m2) {
+        simd4x4_t<float,4> m1 = *this;
+        float32x4_t row, col;
+        for (int i=0; i<4; ++i) {
+            col = vdupq_n_f32(m2.ptr()[i][0]);
+            row = vmulq_f32(m1.m4x4()[0], col);
+            for (int j=1; j<4; ++j) {
+                col = vdupq_n_f32(m2.ptr()[i][j]);
+                row = vaddq_f32(row, vmulq_f32(m1.m4x4()[j], col));
+            }
+            simd4x4[i] = row;
+        }
+        return *this;
+    }
+};
+
+template<int M>
+inline simd4_t<float,M> operator*(const simd4x4_t<float,4>& m, const simd4_t<float,M>& vi)
+{
+    float32x4_t mv = vmulq_f32(m.m4x4()[0], vdupq_n_f32(vi.ptr()[0]));
+    for (int i=1; i<M; ++i) {
+        float32x4_t row = vmulq_f32(m.m4x4()[i], vdupq_n_f32(vi.ptr()[i]));
+        mv = vaddq_f32(mv, row);
+    }
+    return mv;
+}
+
+namespace simd4x4
+{
+
+template<>
+inline simd4x4_t<float,4> rotation_matrix<float>(float angle, const simd4_t<float,3>& axis)
+{
+    float s = std::sin(angle), c = std::cos(angle), t = 1.0-c;
+    simd4_t<float,3> axt, at = axis*t, as = axis*s;
+    simd4x4_t<float,4> m;
+
+    simd4x4::unit(m);
+    axt = axis.ptr()[0]*at;
+    m.m4x4()[0] = axt.v4();
+
+    axt = axis.ptr()[1]*at;
+    m.ptr()[0][0] += c;
+    m.ptr()[0][1] += as.ptr()[2];
+    m.ptr()[0][2] -= as.ptr()[1];
+
+    m.m4x4()[1] = axt.v4();
+
+    axt = axis.ptr()[2]*at;
+    m.ptr()[1][0] -= as.ptr()[2];
+    m.ptr()[1][1] += c;
+    m.ptr()[1][2] += as.ptr()[0];
+
+    m.m4x4()[2] = axt.v4();
+
+    m.ptr()[2][0] += as.ptr()[1];
+    m.ptr()[2][1] -= as.ptr()[0];
+    m.ptr()[2][2] += c;
+
+    return m;
+}
+
+template<>
+inline simd4x4_t<float,4> transpose<float>(simd4x4_t<float,4> m) {
+// http://clb.demon.fi/MathGeoLib/nightly/docs/float4x4_neon.h_code.html
+    float32x4x4_t x = vld4q_f32(m);
+    vst1q_f32(&m[0], x.val[0]);
+    vst1q_f32(&m[4], x.val[1]);
+    vst1q_f32(&m[8], x.val[2]);
+    vst1q_f32(&m[12], x.val[3]);
+    return m;
+}
+
+inline void translate(simd4x4_t<float,4>& m, const simd4_t<float,3>& dist) {
+    m.m4x4()[3] = vsubq_f32(m.m4x4()[3], dist.v4());
+}
+
+template<typename S>
+inline void pre_translate(simd4x4_t<float,4>& m, const simd4_t<S,3>& dist)
+{
+    simd4x4_t<float,4> mt = simd4x4::transpose(m);
+    float32x4_t row3 = mt.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        float32x4_t t = vdupq_n_f32(float(dist[i]));
+        mt.m4x4()[i] = vaddq_f32(mt.m4x4()[i], vmulq_f32(t, row3));
+    }
+    m = simd4x4::transpose(mt);
+}
+
+template<typename S>
+inline void post_translate(simd4x4_t<float,4>& m, const simd4_t<S,3>& dist)
+{
+    float32x4_t col3 = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        float32x4_t t = vdupq_n_f32(float(dist[i]));
+        col3 = vaddq_f32(col3, vmulq_f32(t, m.m4x4()[i]));
+    }
+    m.m4x4()[3] = col3;
+}
+
+template<>
+inline simd4_t<float,3> transform<float>(const simd4x4_t<float,4>& m, const simd4_t<float,3>& pt) {
+    float32x4_t tpt = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        float32x4_t ptd = vdupq_n_f32(pt[i]);
+        tpt = vaddq_f32(tpt, vmulq_f32(ptd, m.m4x4()[i]));
+    }
+    vsetq_lane_f32(0.0f, tpt, 3);
+    return tpt;
+}
+
+} /* namespace simd4x */
+
+
+
+#if 0
+
+template<>
+class simd4x4_t<double,4>
+{
+private:
+    typedef double  __mtx4d_t[4][4];
+
+    union ALIGN32 {
+        __m256d simd4x4[4];
+        __mtx4d_t mtx;
+        double array[4*4];
+    } ALIGN32C;
+
+public:
+    simd4x4_t(void) {}
+    simd4x4_t(double m00, double m01, double m02, double m03,
+              double m10, double m11, double m12, double m13,
+              double m20, double m21, double m22, double m23,
+              double m30, double m31, double m32, double m33)
+    {
+        simd4x4[0] = _mm256_set_pd(m30,m20,m10,m00);
+        simd4x4[1] = _mm256_set_pd(m31,m21,m11,m01);
+        simd4x4[2] = _mm256_set_pd(m32,m22,m12,m02);
+        simd4x4[3] = _mm256_set_pd(m33,m23,m13,m03);
+    }
+    explicit simd4x4_t(const double m[4*4]) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<double,4>((const double*)&m[4*i]).v4();
+        }
+    }
+
+    explicit simd4x4_t(const __mtx4d_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<double,4>(m[i]).v4();
+        }
+    }
+    simd4x4_t(const simd4x4_t<double,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+    }
+    ~simd4x4_t(void) {}
+
+    inline __m256d (&m4x4(void))[4] {
+        return simd4x4;
+    }
+
+    inline const __m256d (&m4x4(void) const)[4] {
+        return simd4x4;
+    }
+
+    inline const double (&ptr(void) const)[4][4] {
+        return mtx;
+    }
+
+    inline double (&ptr(void))[4][4] {
+        return mtx;
+    }
+
+    inline operator const double*(void) const {
+        return array;
+    }
+
+    inline operator double*(void) {
+        return array;
+    }
+
+    template<int M>
+    inline void set(int i, const simd4_t<double,M>& v) {
+        simd4x4[i] = v.v4();
+    }
+
+    inline simd4x4_t<double,4>& operator=(const __mtx4d_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<double,4>(m[i]).v4();
+        }
+        return *this;
+    }
+    inline simd4x4_t<double,4>& operator=(const simd4x4_t<double,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<double,4>& operator+=(const simd4x4_t<double,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = _mm256_add_pd(simd4x4[i], m.m4x4()[i]);
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<double,4>& operator-=(const simd4x4_t<double,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = _mm256_sub_pd(simd4x4[i], m.m4x4()[i]);
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<double,4>& operator*=(double f) {
+        simd4_t<double,4> f4(f);
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] = _mm256_mul_pd(simd4x4[i], f4.v4());
+        }
+        return *this;
+    }
+
+    simd4x4_t<double,4>& operator*=(const simd4x4_t<double,4>& m2) {
+        simd4x4_t<double,4> m1 = *this;
+        __m256d row, col;
+        for (int i=0; i<4; ++i ) {
+            col = _mm256_set1_pd(m2.ptr()[i][0]);
+            row = _mm256_mul_pd(m1.m4x4()[0], col);
+            for (int j=1; j<4; ++j) {
+                col = _mm256_set1_pd(m2.ptr()[i][j]);
+                row = _mm256_add_pd(row, _mm256_mul_pd(m1.m4x4()[j], col));
+            }
+            simd4x4[i] = row;
+        }
+        return *this;
+    }
+
+};
+
+template<int M>
+inline simd4_t<double,M> operator*(const simd4x4_t<double,4>& m, const simd4_t<double,M>& vi)
+{
+    __m256d mv = _mm256_mul_pd(m.m4x4()[0], _mm256_set1_pd(vi.ptr()[0]));
+    for (int i=1; i<M; ++i) {
+        __m256d row = _mm256_mul_pd(m.m4x4()[i], _mm256_set1_pd(vi.ptr()[i]));
+        mv = _mm256_add_pd(mv, row);
+    }
+    return mv;
+}
+
+namespace simd4x4
+{
+
+template<>
+inline simd4x4_t<double,4> rotation_matrix<double>(double angle, const simd4_t<double,3>& axis)
+{
+    double s = std::sin(angle), c = std::cos(angle), t = 1.0-c;
+    simd4_t<double,3> axt, at = axis*t, as = axis*s;
+    simd4x4_t<double,4> m;
+
+    simd4x4::unit(m);
+    axt = axis.ptr()[0]*at;
+    m.m4x4()[0] = axt.v4();
+
+    axt = axis.ptr()[1]*at;
+    m.ptr()[0][0] += c;
+    m.ptr()[0][1] += as.ptr()[2];
+    m.ptr()[0][2] -= as.ptr()[1];
+
+    m.m4x4()[1] = axt.v4();
+
+    axt = axis.ptr()[2]*at;
+    m.ptr()[1][0] -= as.ptr()[2];
+    m.ptr()[1][1] += c;
+    m.ptr()[1][2] += as.ptr()[0];
+
+    m.m4x4()[2] = axt.v4();
+
+    m.ptr()[2][0] += as.ptr()[1];
+    m.ptr()[2][1] -= as.ptr()[0];
+    m.ptr()[2][2] += c;
+
+    return m;
+}
+
+template<>
+inline simd4x4_t<double,4> transpose<double>(simd4x4_t<double,4> m) {
+// http://stackoverflow.com/questions/36167517/m256d-transpose4-equivalent
+    __m256d tmp0 = _mm256_shuffle_pd(m.m4x4()[0], m.m4x4()[1], 0x0);
+    __m256d tmp2 = _mm256_shuffle_pd(m.m4x4()[0], m.m4x4()[1], 0xF);
+    __m256d tmp1 = _mm256_shuffle_pd(m.m4x4()[2], m.m4x4()[3], 0x0);
+    __m256d tmp3 = _mm256_shuffle_pd(m.m4x4()[2], m.m4x4()[3], 0xF);
+
+    m.m4x4()[0] = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);
+    m.m4x4()[1] = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);
+    m.m4x4()[2] = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);
+    m.m4x4()[3] = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);
+
+    return m;
+}
+
+inline void translate(simd4x4_t<double,4>& m, const simd4_t<double,3>& dist) {
+    m.m4x4()[3] = _mm256_sub_pd(m.m4x4()[3], dist.v4());
+}
+
+template<typename S>
+inline void pre_translate(simd4x4_t<double,4>& m, const simd4_t<S,3>& dist)
+{
+    simd4_t<double,4> row3(m.ptr()[0][3],m.ptr()[1][3],m.ptr()[2][3],m.ptr()[3][3]);
+    for (int i=0; i<3; ++i) {
+        for (int j=0; j<4; ++j) {
+            m.ptr()[j][i] += row3[j]*double(dist[i]);
+        }
+    }
+#if 0
+    // this is slower
+    simd4x4_t<double,4> mt = simd4x4::transpose(m);
+    __mm256d row3 = mt.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        __mm256d _mm256_set1_pd(float(dist[i]));
+        mt.m4x4()[i] = _mm256_add_pd(mt.m4x4()[i], _mm256_mul_pd(t, row3));
+    }
+    m = simd4x4::transpose(mt);
+#endif
+}
+
+template<typename S>
+inline void post_translate(simd4x4_t<double,4>& m, const simd4_t<S,3>& dist) {
+    __m256d col3 = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        __m256d t = _mm256_set1_pd(double(dist[i]));
+        col3 = _mm256_add_pd(col3, _mm256_mul_pd(t, m.m4x4()[i]));
+    }
+    m.m4x4()[3] = col3;
+}
+
+template<>
+inline simd4_t<double,3> transform<double>(const simd4x4_t<double,4>& m, const simd4_t<double,3>& pt) {
+    simd4_t<double,3> res;
+    __m256d tpt = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        __m256d ptd = _mm256_set1_pd(pt[i]);
+        tpt = _mm256_add_pd(tpt, _mm256_mul_pd(ptd, m.m4x4()[i]));
+    }
+    res = tpt;
+    res[3] = 0.0;
+    return res;
+}
+
+} /* namespace simd4x4 */
+# endif
+
+
+template<>
+class simd4x4_t<int,4>
+{
+private:
+    typedef int  __mtx4i_t[4][4];
+
+    union ALIGN16 {
+        int32x4_t simd4x4[4];
+        __mtx4i_t mtx;
+        int array[4*4];
+    } ALIGN16C;
+
+public:
+    simd4x4_t(void) {}
+    simd4x4_t(int m00, int m01, int m02, int m03,
+              int m10, int m11, int m12, int m13,
+              int m20, int m21, int m22, int m23,
+              int m30, int m31, int m32, int m33)
+    {
+        array[0] = m00; array[1] = m10;
+        array[2] = m20; array[3] = m30;
+        array[4] = m01; array[5] = m11;
+        array[6] = m21; array[7] = m31;
+        array[8] = m02; array[9] = m12;
+        array[10] = m22; array[11] = m32;
+        array[12] = m03; array[13] = m13;
+        array[14] = m23; array[15] = m33;
+    }
+    simd4x4_t(const int m[4*4]) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<int,4>((const int*)&m[4*i]).v4();
+        }
+    }
+    explicit simd4x4_t(const __mtx4i_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<int,4>(m[i]).v4();
+        }
+    }
+    simd4x4_t(const simd4x4_t<int,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+    }
+    ~simd4x4_t(void) {}
+
+    inline int32x4_t (&m4x4(void))[4] {
+        return simd4x4;
+    }
+
+    inline const int32x4_t (&m4x4(void) const)[4] {
+        return simd4x4;
+    }
+    
+    inline const int (&ptr(void) const)[4][4] {
+        return mtx;
+    }
+
+    inline int (&ptr(void))[4][4] {
+        return mtx;
+    }
+
+    inline operator const int*(void) const {
+        return array;
+    }
+
+    inline operator int*(void) {
+        return array;
+    }
+
+    template<int M>
+    inline void set(int i, const simd4_t<int,M>& v) {
+        simd4x4[i] = v.v4();
+    }
+
+    inline simd4x4_t<int,4>& operator=(const __mtx4i_t m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = simd4_t<int,4>(m[i]).v4();
+        }
+        return *this;
+    }
+    inline simd4x4_t<int,4>& operator=(const simd4x4_t<int,4>& m) {
+        for (int i=0; i<4; ++i) {
+            simd4x4[i] = m.m4x4()[i];
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<int,4>& operator+=(const simd4x4_t<int,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] += m.m4x4()[i];
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<int,4>& operator-=(const simd4x4_t<int,4>& m) {
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] -= m.m4x4()[i];
+        }
+        return *this;
+    }
+
+    inline simd4x4_t<int,4>& operator*=(int f) {
+        simd4_t<int,4> f4(f);
+        for (int i=0; i<4; ++i) {
+           simd4x4[i] *= f4.v4();
+        }
+        return *this;
+    }
+
+    simd4x4_t<int,4>& operator*=(const simd4x4_t<int,4>& m2) {
+        simd4x4_t<int,4> m1 = *this;
+        simd4_t<int,4> row, col;
+
+        for (int i=0; i<4; ++i) {
+            simd4_t<int,4> col(m2.ptr()[i][0]);
+            row.v4() = m1.m4x4()[0] * col.v4();
+            for (int j=1; j<4; ++j) {
+                simd4_t<int,4> col(m2.ptr()[i][j]);
+                row.v4() += m1.m4x4()[j] * col.v4();
+            }
+            simd4x4[i] = row.v4();
+        }
+        return *this;
+    }
+};
+
+template<int M>
+inline simd4_t<int,M> operator*(const simd4x4_t<int,4>& m, const simd4_t<int,M>& vi)
+{
+    simd4_t<int,M> mv(m.m4x4()[0]);
+    mv *= vi.ptr()[0];
+    for (int i=1; i<M; ++i) {
+        simd4_t<int,4> row(m.m4x4()[i]);
+        row *= vi.ptr()[i];
+        mv.v4() += row.v4();
+    }
+    for (int i=M; i<4; ++i) mv[i] = 0;
+    return mv;
+}
+
+template<>
+inline simd4x4_t<int,4> operator*(const simd4x4_t<int,4>& m1, const simd4x4_t<int,4>& m2)
+{
+    simd4_t<int,4> row, col;
+    simd4x4_t<int,4> m;
+
+    for (int i=0; i<4; ++i) {
+        simd4_t<int,4> col(m2.ptr()[i][0]);
+        row.v4() = m1.m4x4()[0] * col.v4();
+        for (int j=1; j<4; ++j) {
+            simd4_t<int,4> col(m2.ptr()[i][j]);
+            row.v4() += m1.m4x4()[j] * col.v4();
+        }
+        m.m4x4()[i] = row.v4();
+    }
+
+    return m;
+}
+
+namespace simd4x4
+{
+
+template<>
+inline simd4x4_t<int,4> transpose<int>(simd4x4_t<int,4> m) {
+    int32x4x4_t x = vld4q_s32(m);
+    vst1q_s32(&m[0], x.val[0]);
+    vst1q_s32(&m[4], x.val[1]);
+    vst1q_s32(&m[8], x.val[2]);
+    vst1q_s32(&m[12], x.val[3]);
+    return m;
+}
+
+inline void translate(simd4x4_t<int,4>& m, const simd4_t<int,3>& dist) {
+    m.m4x4()[3] = vsubq_s32(m.m4x4()[3], dist.v4());
+}
+
+template<typename S>
+inline void pre_translate(simd4x4_t<int,4>& m, const simd4_t<S,3>& dist)
+{
+    simd4x4_t<int,4> mt = simd4x4::transpose(m);
+    simd4_t<int,4> row3(mt.ptr()[3]);
+    for (int i=0; i<3; ++i) {
+        simd4_t<int,4> trow3 = int(dist[i]);
+        trow3 *= row3.v4();
+        mt.m4x4()[i] = vaddq_s32(mt.m4x4()[i], trow3.v4());
+    }
+    m = simd4x4::transpose(mt);
+}
+
+template<typename S>
+inline void post_translate(simd4x4_t<int,4>& m, const simd4_t<S,3>& dist)
+{
+    int32x4_t col3 = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        simd4_t<int,4> trow3 = int(dist[i]);
+        trow3 *= m.m4x4()[i];
+        col3 = vaddq_s32(col3, trow3.v4());
+    }
+    m.m4x4()[3] = col3;
+}
+
+template<>
+inline simd4_t<int,3> transform<int>(const simd4x4_t<int,4>& m, const simd4_t<int,3>& pt) {
+    simd4_t<int,3> tpt = m.m4x4()[3];
+    for (int i=0; i<3; ++i) {
+        simd4_t<int,3> ptd = m.m4x4()[i];
+        ptd *= pt[i];
+        tpt.v4() = vaddq_s32(tpt.v4(), ptd.v4());
+    }
+    tpt[3] = 0.0;
+    return tpt;
+}
+
+
+} /* namespace simd4x */
+#endif
+
+#endif /* __SIMD4X4_NEON_H__ */
+
--- a/simgear/math/simd_neon.hxx
+++ b/simgear/math/simd_neon.hxx
@@ -40,23 +40,22 @@ class simd4_t<float,N>
 private:
   typedef float  __vec4f_t[N];

-    union {
+    union ALIGN16 {
        float32x4_t simd4;
        __vec4f_t vec;
        float _v4[4];
-    };
+    } ALIGN16C;

 public:
    simd4_t(void) {}
    simd4_t(float f) {
-        simd4 = vdupq_n_f3(f);
+        simd4 = vdupq_n_f32(f);
        for (int i=N; i<4; ++i) _v4[i] = 0.0f;
    }
    simd4_t(float x, float y) : simd4_t(x,y,0,0) {}
    simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {}
    simd4_t(float x, float y, float z, float w) {
-        ALIGN16 float ALIGN16C data[4] = { x, y, z, w };
-        simd4 = vld1q_f32(data);
+        _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w;
    }
    simd4_t(const __vec4f_t v) {
        simd4 = vld1q_f32(v);
@@ -91,7 +90,7 @@ public:
    }

    inline simd4_t<float,N>& operator=(float f) {
-        simd4 = vdupq_n_f3(f);
+        simd4 = vdupq_n_f32(f);
        for (int i=N; i<4; ++i) _v4[i] = 0.0f;
        return *this;
    }
@@ -142,7 +141,7 @@ public:
    template<int M>
    inline simd4_t<float,N>& operator/=(const simd4_t<float,M>& v) {
 // http://stackoverflow.com/questions/6759897/how-to-divide-in-neon-intrinsics-by-a-float-number
-        float32x2_t recip = vrecpeq_f32(v.v4());
+        float32x4_t recip = vrecpeq_f32(v.v4());
        recip = vmulq_f32(vrecpsq_f32(v.v4(), recip), recip);
        recip = vmulq_f32(vrecpsq_f32(v.v4(), recip), recip);
        simd4 = vmulq_f32(simd4, recip);
@@ -168,12 +167,12 @@ inline float dot(simd4_t<float,4> v1, const simd4_t<float,4>& v2) {
    return hsum_float32x4_neon(v1.v4()*v2.v4());
 }

-// https://github.com/scoopr/vectorial/blob/master/include/vectorial/float32x4_t_neon.h
+// https://github.com/scoopr/vectorial/blob/master/include/vectorial/simd4f_neon.h
 template<>
 inline simd4_t<float,3> cross(const simd4_t<float,3>& v1, const simd4_t<float,3>& v2)
 {
-    static const uint32_t mask[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
-    static const int32x4_t mask = vld1q_s32((const int32_t*)mask);
+    static const uint32_t mask_a[] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
+    static const int32x4_t mask = vld1q_s32((const int32_t*)mask_a);

    // Compute v1 and v2 in order yzx
    float32x2_t v1_low = vget_low_f32(v1.v4());
@@ -181,13 +180,14 @@ inline simd4_t<float,3> cross(const simd4_t<float,3>& v1, const simd4_t<float,3>
    float32x4_t v1_yzx = vcombine_f32(vext_f32(v1_low, vget_high_f32(v1.v4()),1), v1_low);
    float32x4_t v2_yzx = vcombine_f32(vext_f32(v2_low, vget_high_f32(v2.v4()),1), v2_low);
    // Compute cross in order zxy
-    float32x4_t s3 = float32x4_t_sub(float32x4_t_mul(v2_yzx, v1.v4()), float32x4_t_mul(v1_yzx, v2.v4()));
+    float32x4_t s3 = vsubq_f32(vmulq_f32(v2_yzx, v1.v4()), vmulq_f32(v1_yzx, v2.v4()));
    // Permute cross to order xyz and zero out the fourth value
    float32x2_t low = vget_low_f32(s3);
    s3 = vcombine_f32(vext_f32(low, vget_high_f32(s3), 1), low);
    return (float32x4_t)vandq_s32((int32x4_t)s3,mask);
 }

+
 template<int N>
 inline simd4_t<float,N> min(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
    v1 = vminq_f32(v1.v4(), v2.v4());
@@ -386,11 +386,11 @@ class simd4_t<int,N>
 private:
   typedef int  __vec4i_t[N];

-    union {
+    union ALIGN16 {
        int32x4_t simd4;
        __vec4i_t vec;
        int _v4[4];
-    };
+    } ALIGN16C;

 public:
    simd4_t(void) {}
@@ -401,11 +401,10 @@ public:
    simd4_t(int x, int y) : simd4_t(x,y,0,0) {}
    simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {}
    simd4_t(int x, int y, int z, int w) {
-        ALIGN16 int32_t ALIGN16C data[4] = { x, y, z, w };
-        simd4 = vld1q_s32(data);
+        _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w;
    }
    simd4_t(const __vec4i_t v) {
-        simd4 = vld1q_s32((int32x4_t*)v);
+        simd4 = vld1q_s32(v);
        for (int i=N; i<4; ++i) _v4[i] = 0;
    }
    simd4_t(const simd4_t<int,N>& v) {
@@ -445,7 +444,7 @@ public:
        return *this;
    }
    inline simd4_t<int,N>& operator=(const __vec4i_t v) {
-        simd4 = vld1q_s32((int32x4_t*)v);
+        simd4 = vld1q_s32(v);
        for (int i=N; i<4; ++i) _v4[i] = 0;
        return *this;
    }
@@ -484,7 +483,7 @@ public:
         return operator*=(v.v4());
    }
    inline simd4_t<int,N>& operator*=(const int32x4_t& v) {
-        simd4 = vmulq_s32(simd4, v.v4());
+        simd4 = vmulq_s32(simd4, v);
        return *this;
    }