From 610ce4e19b0b39d0e8391057b22163d4fdc7bdb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Mon, 3 Jul 2017 23:24:55 +0200
Subject: [PATCH 2/2] Use ARM NEON intrinsics if available for mixing functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Andreas Müller
---
 libs/ardour/ardour/mix.h |  10 +++
 libs/ardour/globals.cc   |  13 ++++
 libs/ardour/mix.cc       | 157 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+)

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index 4676c01..55919f0 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -65,6 +65,16 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
 
 #endif
 
+#if defined (__ARM_NEON__)
+
+LIBARDOUR_API float neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+LIBARDOUR_API void neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+
+#endif
+
 /* non-optimized functions */
 
 LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 28eb818..d562b35 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -222,6 +222,19 @@ setup_hardware_optimization (bool try_optimization)
 
 		info << "Apple VecLib H/W specific optimizations in use" << endmsg;
 	}
+#elif defined (__ARM_NEON__)
+	// No runtime detection
+	compute_peak = neon_compute_peak;
+	find_peaks = neon_find_peaks;
+	apply_gain_to_buffer = neon_apply_gain_to_buffer;
+	mix_buffers_with_gain = neon_mix_buffers_with_gain;
+	mix_buffers_no_gain = neon_mix_buffers_no_gain;
+	copy_vector = default_copy_vector;
+
+	generic_mix_functions = false;
+
+	info << "ARM NEON optimizations in use" << endmsg;
+
 #endif
 
 	/* consider FPU denormal handling to be "h/w optimization" */
diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc
index 96ae624..d1a46a2 100644
--- a/libs/ardour/mix.cc
+++ b/libs/ardour/mix.cc
@@ -182,4 +182,161 @@ veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pf
 
 
 #endif
 
+#if defined (__ARM_NEON__)
+#include <arm_neon.h>
+
+float
+neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current)
+{
+	float32x4_t v4current = vdupq_n_f32(current);
+	float32x4_t v4work;
+	float32x2_t v2current;
+
+	// unaligned lead (spread single float to vector)
+	while (((intptr_t)buf) % 16 != 0 && nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf++;
+		nsamples--;
+	}
+	// aligned
+	while (nsamples >= 4) {
+		v4work = vld1q_f32(buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf+=4;
+		nsamples-=4;
+	}
+	// rest < 4 (spread single float to vector)
+	while (nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf++;
+		nsamples--;
+	}
+
+	// calc max in vector by pairwise max done twice
+	v2current = vpmax_f32(vget_low_f32(v4current), vget_high_f32(v4current));
+	v2current = vpmax_f32(v2current, v2current);
+	return vget_lane_f32(v2current, 0);
+}
+
+void
+neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max)
+{
+	float32x4_t v4min = vdupq_n_f32(*min);
+	float32x4_t v4max = vdupq_n_f32(*max);
+	float32x4_t v4work;
+	float32x2_t v2min, v2max;
+
+	// vector
+	while (nsamples >= 4) {
+		v4work = vld1q_f32(buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+
+		buf+=4;
+		nsamples-=4;
+	}
+	// rest < 4 (spread single float to vector)
+	while (nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+
+		buf++;
+		nsamples--;
+	}
+
+	// calc min in vector by pairwise min done twice
+	v2min = vpmin_f32(vget_low_f32(v4min), vget_high_f32(v4min));
+	v2min = vpmin_f32(v2min, v2min);
+	*min = vget_lane_f32(v2min, 0);
+	// calc max in vector by pairwise max done twice
+	v2max = vpmax_f32(vget_low_f32(v4max), vget_high_f32(v4max));
+	v2max = vpmax_f32(v2max, v2max);
+	*max = vget_lane_f32(v2max, 0);
+}
+
+void
+neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain)
+{
+	float32x4_t v4gain = vdupq_n_f32(gain);
+	float32x4_t v4work;
+
+	// vector
+	while (nframes >= 4) {
+		v4work = vld1q_f32(buf);
+		v4work = vmulq_f32(v4work, v4gain);
+		vst1q_f32(buf, v4work);
+
+		buf+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*buf *= gain;
+
+		buf++;
+		nframes--;
+	}
+}
+
+void
+neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain)
+{
+	float32x4_t v4gain = vdupq_n_f32(gain);
+	float32x4_t v4src, v4dst;
+
+	// vector
+	while (nframes >= 4) {
+		v4src = vld1q_f32(src);
+		v4dst = vld1q_f32(dst);
+		// v4dst = v4dst + v4src * v4gain
+		v4dst = vmlaq_f32(v4dst, v4src, v4gain);
+		vst1q_f32(dst, v4dst);
+
+		src+=4;
+		dst+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*dst += *src * gain;
+
+		src++;
+		dst++;
+		nframes--;
+	}
+}
+
+void
+neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes)
+{
+	float32x4_t v4src, v4dst;
+
+	// vector
+	while (nframes >= 4) {
+		v4src = vld1q_f32(src);
+		v4dst = vld1q_f32(dst);
+		v4dst = vaddq_f32(v4dst, v4src);
+		vst1q_f32(dst, v4dst);
+
+		src+=4;
+		dst+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*dst += *src;
+
+		src++;
+		dst++;
+		nframes--;
+	}
+}
+
+#endif
-- 
2.9.4
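
For review purposes, a minimal standalone sketch (not part of the patch) of the horizontal-peak reduction the patch uses in neon_compute_peak, checked against a plain scalar loop. The helper names scalar_peak and neon_style_peak are illustrative only; the sketch assumes a NEON-capable ARM toolchain (for example -mfpu=neon on 32-bit ARM), and the buffer contents are arbitrary test values.

// Standalone sketch, not part of the patch: compares the pairwise-max
// reduction used by neon_compute_peak against a plain scalar loop.
// Assumes a NEON-capable ARM toolchain (e.g. compile with -mfpu=neon);
// buffer size and values are arbitrary test data.
#include <arm_neon.h>
#include <cmath>
#include <cstdio>

static float scalar_peak (const float* buf, unsigned n, float current)
{
	for (unsigned i = 0; i < n; ++i) {
		float a = std::fabs (buf[i]);
		if (a > current) {
			current = a;
		}
	}
	return current;
}

static float neon_style_peak (const float* buf, unsigned n, float current)
{
	float32x4_t v4current = vdupq_n_f32 (current);

	// 4 samples at a time
	while (n >= 4) {
		v4current = vmaxq_f32 (v4current, vabsq_f32 (vld1q_f32 (buf)));
		buf += 4;
		n -= 4;
	}
	// remainder: broadcast the single sample across the vector
	while (n > 0) {
		v4current = vmaxq_f32 (v4current, vabsq_f32 (vdupq_n_f32 (*buf)));
		buf++;
		n--;
	}
	// horizontal max: pairwise max done twice, then take lane 0
	float32x2_t v2 = vpmax_f32 (vget_low_f32 (v4current), vget_high_f32 (v4current));
	v2 = vpmax_f32 (v2, v2);
	return vget_lane_f32 (v2, 0);
}

int main ()
{
	const float buf[7] = { 0.1f, -0.9f, 0.3f, 0.25f, -0.4f, 0.7f, -0.2f };
	std::printf ("scalar: %f\n", scalar_peak (buf, 7, 0.0f));
	std::printf ("neon:   %f\n", neon_style_peak (buf, 7, 0.0f));
	return 0;
}

On a NEON-capable board both lines should print the same value (0.900000 for this input), which is a quick way to sanity-check the reduction before wiring the functions into setup_hardware_optimization.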