mirror of
https://github.com/schnitzeltony/meta-musicians.git
synced 2026-01-29 17:18:41 +01:00
230 lines
6.1 KiB
Diff
230 lines
6.1 KiB
Diff
From 610ce4e19b0b39d0e8391057b22163d4fdc7bdb4 Mon Sep 17 00:00:00 2001
|
|
From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
|
|
Date: Mon, 3 Jul 2017 23:24:55 +0200
|
|
Subject: [PATCH 2/2] Use ARM NEON intrinsics if available for mixing functions
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
|
|
---
|
|
libs/ardour/ardour/mix.h | 10 +++
|
|
libs/ardour/globals.cc | 13 ++++
|
|
libs/ardour/mix.cc | 157 +++++++++++++++++++++++++++++++++++++++++++++++
|
|
3 files changed, 180 insertions(+)
|
|
|
|
diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
|
|
index 4676c01..55919f0 100644
|
|
--- a/libs/ardour/ardour/mix.h
|
|
+++ b/libs/ardour/ardour/mix.h
|
|
@@ -65,6 +65,16 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
|
|
|
|
#endif
|
|
|
|
+#if defined (__ARM_NEON__)
|
|
+/* ARM NEON variants of the mixing functions (implemented in mix.cc) */
|
|
+LIBARDOUR_API float neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
|
|
+LIBARDOUR_API void neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
|
|
+LIBARDOUR_API void neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
|
|
+LIBARDOUR_API void neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
|
|
+LIBARDOUR_API void neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
|
+
|
|
+#endif
|
|
+
|
|
/* non-optimized functions */
|
|
|
|
LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
|
|
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
|
|
index 28eb818..d562b35 100644
|
|
--- a/libs/ardour/globals.cc
|
|
+++ b/libs/ardour/globals.cc
|
|
@@ -222,6 +222,19 @@ setup_hardware_optimization (bool try_optimization)
|
|
|
|
info << "Apple VecLib H/W specific optimizations in use" << endmsg;
|
|
}
|
|
+#elif defined (__ARM_NEON__)
|
|
+ // No runtime detection
|
|
+ compute_peak = neon_compute_peak;
|
|
+ find_peaks = neon_find_peaks;
|
|
+ apply_gain_to_buffer = neon_apply_gain_to_buffer;
|
|
+ mix_buffers_with_gain = neon_mix_buffers_with_gain;
|
|
+ mix_buffers_no_gain = neon_mix_buffers_no_gain;
|
|
+ copy_vector = default_copy_vector;
|
|
+
|
|
+ generic_mix_functions = false;
|
|
+
|
|
+ info << "ARM NEON optimizations in use" << endmsg;
|
|
+
|
|
#endif
|
|
|
|
/* consider FPU denormal handling to be "h/w optimization" */
|
|
diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc
|
|
index 96ae624..d1a46a2 100644
|
|
--- a/libs/ardour/mix.cc
|
|
+++ b/libs/ardour/mix.cc
|
|
@@ -182,4 +182,161 @@ veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pf
|
|
|
|
#endif
|
|
|
|
+#if defined (__ARM_NEON__)
|
|
+#include <arm_neon.h>
|
|
+
|
|
+float
+neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current)
+{
+	/* Peak detection: returns the larger of `current` and the largest
+	 * absolute sample value in buf[0 .. nsamples-1].
+	 *
+	 * Four running maxima are kept in the lanes of `peaks`; they are
+	 * folded into a single value once all samples have been visited.
+	 */
+	float32x4_t peaks = vdupq_n_f32(current);
+	float32x4_t chunk;
+
+	// head: one sample per step until buf reaches a 16 byte boundary
+	for (; nsamples > 0 && ((intptr_t)buf) % 16 != 0; ++buf, --nsamples) {
+		chunk = vdupq_n_f32(*buf); // same sample in all four lanes
+		peaks = vmaxq_f32(peaks, vabsq_f32(chunk));
+	}
+
+	// body: four samples per step
+	for (; nsamples >= 4; buf += 4, nsamples -= 4) {
+		chunk = vld1q_f32(buf);
+		peaks = vmaxq_f32(peaks, vabsq_f32(chunk));
+	}
+
+	// tail: the remaining (fewer than four) samples, one per step
+	for (; nsamples > 0; ++buf, --nsamples) {
+		chunk = vdupq_n_f32(*buf);
+		peaks = vmaxq_f32(peaks, vabsq_f32(chunk));
+	}
+
+	// horizontal maximum: the first pairwise step leaves two candidates,
+	// the second leaves the overall maximum in both lanes
+	float32x2_t folded = vpmax_f32(vget_low_f32(peaks), vget_high_f32(peaks));
+	folded = vpmax_f32(folded, folded);
+
+	return vget_lane_f32(folded, 0);
+}
|
|
+
|
|
+void
+neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max)
+{
+	/* Widen the range [*min, *max] so that it also covers every sample in
+	 * buf[0 .. nsamples-1]; both values are read first, then overwritten. */
+	float32x4_t v4min = vdupq_n_f32(*min);
+	float32x4_t v4max = vdupq_n_f32(*max);
+	float32x4_t v4work;
+	float32x2_t v2min, v2max;
+
+	// vector: four samples per iteration, per-lane running min / max
+	while (nsamples >= 4) {
+		v4work = vld1q_f32(buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+		buf+=4;
+		nsamples-=4;
+	}
+	// rest < 4 (spread single float to vector)
+	while (nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+		buf++;
+		nsamples--;
+	}
+	// calc min in vector by pairwise min done twice; the second step has to
+	// be a min as well, a max would pick the larger of the two candidates
+	v2min = vpmin_f32(vget_low_f32(v4min), vget_high_f32(v4min));
+	v2min = vpmin_f32(v2min, v2min);
+	*min = vget_lane_f32(v2min, 0);
+	// calc max in vector by pairwise max done twice
+	v2max = vpmax_f32(vget_low_f32(v4max), vget_high_f32(v4max));
+	v2max = vpmax_f32(v2max, v2max);
+	*max = vget_lane_f32(v2max, 0);
+}
|
|
+
|
|
+void
+neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain)
+{
+	/* In-place gain.
+	 *
+	 * Multiplies every sample of buf[0 .. nframes-1] by `gain`. Groups of
+	 * four frames go through NEON, whatever is left over is handled by
+	 * plain scalar code.
+	 */
+	const float32x4_t gain_x4 = vdupq_n_f32(gain); // gain in all four lanes
+
+	// vector part: load four frames, scale them, write them back
+	for (; nframes >= 4; buf += 4, nframes -= 4) {
+		const float32x4_t frames = vld1q_f32(buf);
+		const float32x4_t scaled = vmulq_f32(frames, gain_x4);
+		vst1q_f32(buf, scaled);
+	}
+
+	// scalar part: fewer than four frames left
+	for (; nframes > 0; ++buf, --nframes) {
+		*buf *= gain;
+	}
+}
|
|
+
|
|
+void
+neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain)
+{
+	/* Gain-scaled mix.
+	 *
+	 * Adds src[i] * gain to dst[i] for every i in 0 .. nframes-1. Groups
+	 * of four frames go through NEON, whatever is left over is handled
+	 * by plain scalar code.
+	 * dst is both read and written; src is only read.
+	 */
+	const float32x4_t gain_x4 = vdupq_n_f32(gain); // gain in all four lanes
+
+	// vector part: four frames per step
+	for (; nframes >= 4; src += 4, dst += 4, nframes -= 4) {
+		const float32x4_t in = vld1q_f32(src);
+		const float32x4_t acc = vld1q_f32(dst);
+
+		// multiply-accumulate: acc + in * gain_x4
+		const float32x4_t mixed = vmlaq_f32(acc, in, gain_x4);
+		vst1q_f32(dst, mixed);
+	}
+
+	// scalar part: fewer than four frames left
+	for (; nframes > 0; ++src, ++dst, --nframes) {
+		*dst += *src * gain;
+	}
+}
|
|
+
|
|
+void
+neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes)
+{
+	/* Unity-gain mix.
+	 *
+	 * Adds src[i] to dst[i] for every i in 0 .. nframes-1. Groups of four
+	 * frames go through NEON, whatever is left over is handled by plain
+	 * scalar code. dst is both read and written; src is only read.
+	 */
+
+	// vector part: four frames per step
+	for (; nframes >= 4; src += 4, dst += 4, nframes -= 4) {
+		const float32x4_t in = vld1q_f32(src);
+		const float32x4_t acc = vld1q_f32(dst);
+
+		// lane-wise sum of destination and source
+		const float32x4_t sum = vaddq_f32(acc, in);
+		vst1q_f32(dst, sum);
+	}
+
+	// scalar part: fewer than four frames left
+	for (; nframes > 0; ++src, ++dst, --nframes) {
+		*dst += *src;
+	}
+}
|
|
+
|
|
+#endif
|
|
|
|
--
|
|
2.9.4
|
|
|