Files
meta-musicians/recipes-musicians/ardour/ardour5/0002-Use-ARM-NEON-intrinsics-if-available-for-mixing-func.patch
Andreas Müller 948e5f2d3d ardour: python3-/out-of-tree-build
Signed-off-by: Andreas Müller <schnitzeltony@gmail.com>
2020-01-25 23:42:49 +01:00

230 lines
6.1 KiB
Diff

From 610ce4e19b0b39d0e8391057b22163d4fdc7bdb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
Date: Mon, 3 Jul 2017 23:24:55 +0200
Subject: [PATCH 2/2] Use ARM NEON intrinsics if available for mixing functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
---
libs/ardour/ardour/mix.h | 10 +++
libs/ardour/globals.cc | 13 ++++
libs/ardour/mix.cc | 157 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 180 insertions(+)
diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index 4676c01..55919f0 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -65,6 +65,16 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
#endif
+#if defined (__ARM_NEON__)
+
+LIBARDOUR_API float neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+LIBARDOUR_API void neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+
+#endif
+
/* non-optimized functions */
LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 28eb818..d562b35 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -222,6 +222,19 @@ setup_hardware_optimization (bool try_optimization)
info << "Apple VecLib H/W specific optimizations in use" << endmsg;
}
+#elif defined (__ARM_NEON__)
+ // No runtime detection
+ compute_peak = neon_compute_peak;
+ find_peaks = neon_find_peaks;
+ apply_gain_to_buffer = neon_apply_gain_to_buffer;
+ mix_buffers_with_gain = neon_mix_buffers_with_gain;
+ mix_buffers_no_gain = neon_mix_buffers_no_gain;
+ copy_vector = default_copy_vector;
+
+ generic_mix_functions = false;
+
+ info << "ARM NEON optimizations in use" << endmsg;
+
#endif
/* consider FPU denormal handling to be "h/w optimization" */
diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc
index 96ae624..d1a46a2 100644
--- a/libs/ardour/mix.cc
+++ b/libs/ardour/mix.cc
@@ -182,4 +182,161 @@ veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pf
#endif
+#if defined (__ARM_NEON__)
+#include <arm_neon.h>
+
+float
+neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current)
+{
+ float32x4_t v4current = vdupq_n_f32(current);
+ float32x4_t v4work;
+ float32x2_t v2current;
+
+ // unaligned lead (spread single float to vector)
+ while (((intptr_t)buf) % 16 != 0 && nsamples > 0) {
+ v4work = vdupq_n_f32(*buf);
+ v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+ buf++;
+ nsamples--;
+ }
+ // aligned
+ while (nsamples >= 4) {
+ v4work = vld1q_f32(buf);
+ v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+ buf+=4;
+ nsamples-=4;
+ }
+ // rest < 4 (spread single float to vector)
+ while (nsamples > 0) {
+ v4work = vdupq_n_f32(*buf);
+ v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+ buf++;
+ nsamples--;
+ }
+
+ // calc max in vector by pairwise max done twice
+ v2current = vpmax_f32(vget_low_f32(v4current), vget_high_f32(v4current));
+ v2current = vpmax_f32(v2current, v2current);
+ return vget_lane_f32(v2current, 0);
+}
+
+void
+neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max)
+{
+ float32x4_t v4min = vdupq_n_f32(*min);
+ float32x4_t v4max = vdupq_n_f32(*max);
+ float32x4_t v4work;
+ float32x2_t v2min, v2max;
+
+ // vector
+ while (nsamples >= 4) {
+ v4work = vld1q_f32(buf);
+ v4min = vminq_f32(v4min, v4work);
+ v4max = vmaxq_f32(v4max, v4work);
+
+ buf+=4;
+ nsamples-=4;
+ }
+ // rest < 4 (spread single float to vector)
+ while (nsamples > 0) {
+ v4work = vdupq_n_f32(*buf);
+ v4min = vminq_f32(v4min, v4work);
+ v4max = vmaxq_f32(v4max, v4work);
+
+ buf++;
+ nsamples--;
+ }
+
+ // calc min in vector by pairwise max done twice
+ v2min = vpmin_f32(vget_low_f32(v4min), vget_high_f32(v4min));
+ v2min = vpmax_f32(v2min, v2min);
+ *min = vget_lane_f32(v2min, 0);
+ // calc max in vector by pairwise max done twice
+ v2max = vpmax_f32(vget_low_f32(v4max), vget_high_f32(v4max));
+ v2max = vpmax_f32(v2max, v2max);
+ *max = vget_lane_f32(v2max, 0);
+}
+
+void
+neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain)
+{
+ float32x4_t v4gain = vdupq_n_f32(gain);
+ float32x4_t v4work;
+
+ // vector
+ while (nframes >= 4) {
+ v4work = vld1q_f32(buf);
+ v4work = vmulq_f32(v4work, v4gain);
+ vst1q_f32(buf, v4work);
+
+ buf+=4;
+ nframes-=4;
+ }
+ // rest < 4
+ while (nframes > 0) {
+ *buf *= gain;
+
+ buf++;
+ nframes--;
+ }
+}
+
+void
+neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain)
+{
+ float32x4_t v4gain = vdupq_n_f32(gain);
+ float32x4_t v4src, v4dst;
+
+ // vector
+ while (nframes >= 4) {
+ v4src = vld1q_f32(src);
+ v4dst = vld1q_f32(dst);
+ // v4dst = v4dst + v4src * v4gain
+ v4dst = vmlaq_f32(v4dst, v4src, v4gain);
+ vst1q_f32(dst, v4dst);
+
+ src+=4;
+ dst+=4;
+ nframes-=4;
+ }
+ // rest < 4
+ while (nframes > 0) {
+ *dst += *src * gain;
+
+ src++;
+ dst++;
+ nframes--;
+ }
+}
+
+void
+neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes)
+{
+ float32x4_t v4src, v4dst;
+
+ // vector
+ while (nframes >= 4) {
+ v4src = vld1q_f32(src);
+ v4dst = vld1q_f32(dst);
+ v4dst = vaddq_f32(v4dst, v4src);
+ vst1q_f32(dst, v4dst);
+
+ src+=4;
+ dst+=4;
+ nframes-=4;
+ }
+ // rest < 4
+ while (nframes > 0) {
+ *dst += *src;
+
+ src++;
+ dst++;
+ nframes--;
+ }
+}
+
+#endif
--
2.9.4