From 610ce4e19b0b39d0e8391057b22163d4fdc7bdb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?=
Date: Mon, 3 Jul 2017 23:24:55 +0200
Subject: [PATCH 2/2] Use ARM NEON intrinsics if available for mixing functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Andreas Müller
---
 libs/ardour/ardour/mix.h |  10 +++
 libs/ardour/globals.cc   |  13 ++++
 libs/ardour/mix.cc       | 157 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+)

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index 4676c01..55919f0 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -65,6 +65,16 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
 
 #endif
 
+#if defined (__ARM_NEON__)
+
+LIBARDOUR_API float neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+LIBARDOUR_API void neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+LIBARDOUR_API void neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+
+#endif
+
 /* non-optimized functions */
 
 LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 28eb818..d562b35 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -222,6 +222,19 @@ setup_hardware_optimization (bool try_optimization)
 
 		info << "Apple VecLib H/W specific optimizations in use" << endmsg;
 	}
+#elif defined (__ARM_NEON__)
+	// No runtime detection
+	compute_peak = neon_compute_peak;
+	find_peaks = neon_find_peaks;
+	apply_gain_to_buffer = neon_apply_gain_to_buffer;
+	mix_buffers_with_gain = neon_mix_buffers_with_gain;
+	mix_buffers_no_gain = neon_mix_buffers_no_gain;
+	copy_vector = default_copy_vector;
+
+	generic_mix_functions = false;
+
+	info << "ARM NEON optimizations in use" << endmsg;
+
 #endif
 
 	/* consider FPU denormal handling to be "h/w optimization" */
diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc
index 96ae624..d1a46a2 100644
--- a/libs/ardour/mix.cc
+++ b/libs/ardour/mix.cc
@@ -182,4 +182,161 @@ veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pf
 
 
 #endif
 
+#if defined (__ARM_NEON__)
+#include <arm_neon.h>
+
+float
+neon_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current)
+{
+	float32x4_t v4current = vdupq_n_f32(current);
+	float32x4_t v4work;
+	float32x2_t v2current;
+
+	// unaligned lead (spread single float to vector)
+	while (((intptr_t)buf) % 16 != 0 && nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf++;
+		nsamples--;
+	}
+	// aligned
+	while (nsamples >= 4) {
+		v4work = vld1q_f32(buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf+=4;
+		nsamples-=4;
+	}
+	// rest < 4 (spread single float to vector)
+	while (nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4current = vmaxq_f32(v4current, vabsq_f32(v4work));
+
+		buf++;
+		nsamples--;
+	}
+
+	// calc max in vector by pairwise max done twice
+	v2current = vpmax_f32(vget_low_f32(v4current), vget_high_f32(v4current));
+	v2current = vpmax_f32(v2current, v2current);
+	return vget_lane_f32(v2current, 0);
+}
+
+void
+neon_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max)
+{
+	float32x4_t v4min = vdupq_n_f32(*min);
+	float32x4_t v4max = vdupq_n_f32(*max);
+	float32x4_t v4work;
+	float32x2_t v2min, v2max;
+
+	// vector
+	while (nsamples >= 4) {
+		v4work = vld1q_f32(buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+
+		buf+=4;
+		nsamples-=4;
+	}
+	// rest < 4 (spread single float to vector)
+	while (nsamples > 0) {
+		v4work = vdupq_n_f32(*buf);
+		v4min = vminq_f32(v4min, v4work);
+		v4max = vmaxq_f32(v4max, v4work);
+
+		buf++;
+		nsamples--;
+	}
+
+	// calc min in vector by pairwise min done twice
+	v2min = vpmin_f32(vget_low_f32(v4min), vget_high_f32(v4min));
+	v2min = vpmin_f32(v2min, v2min);
+	*min = vget_lane_f32(v2min, 0);
+	// calc max in vector by pairwise max done twice
+	v2max = vpmax_f32(vget_low_f32(v4max), vget_high_f32(v4max));
+	v2max = vpmax_f32(v2max, v2max);
+	*max = vget_lane_f32(v2max, 0);
+}
+
+void
+neon_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain)
+{
+	float32x4_t v4gain = vdupq_n_f32(gain);
+	float32x4_t v4work;
+
+	// vector
+	while (nframes >= 4) {
+		v4work = vld1q_f32(buf);
+		v4work = vmulq_f32(v4work, v4gain);
+		vst1q_f32(buf, v4work);
+
+		buf+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*buf *= gain;
+
+		buf++;
+		nframes--;
+	}
+}
+
+void
+neon_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain)
+{
+	float32x4_t v4gain = vdupq_n_f32(gain);
+	float32x4_t v4src, v4dst;
+
+	// vector
+	while (nframes >= 4) {
+		v4src = vld1q_f32(src);
+		v4dst = vld1q_f32(dst);
+		// v4dst = v4dst + v4src * v4gain
+		v4dst = vmlaq_f32(v4dst, v4src, v4gain);
+		vst1q_f32(dst, v4dst);
+
+		src+=4;
+		dst+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*dst += *src * gain;
+
+		src++;
+		dst++;
+		nframes--;
+	}
+}
+
+void
+neon_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes)
+{
+	float32x4_t v4src, v4dst;
+
+	// vector
+	while (nframes >= 4) {
+		v4src = vld1q_f32(src);
+		v4dst = vld1q_f32(dst);
+		v4dst = vaddq_f32(v4dst, v4src);
+		vst1q_f32(dst, v4dst);
+
+		src+=4;
+		dst+=4;
+		nframes-=4;
+	}
+	// rest < 4
+	while (nframes > 0) {
+		*dst += *src;
+
+		src++;
+		dst++;
+		nframes--;
+	}
+}
+
+#endif
-- 
2.9.4
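
For review purposes, a minimal standalone sketch (not part of the patch) of the horizontal-peak reduction the patch uses in neon_compute_peak, checked against a plain scalar loop. The helper names scalar_peak and neon_style_peak are illustrative only; the sketch assumes a NEON-capable ARM toolchain (for example -mfpu=neon on 32-bit ARM), and the buffer contents are arbitrary test values.

// Standalone sketch, not part of the patch: compares the pairwise-max
// reduction used by neon_compute_peak against a plain scalar loop.
// Assumes a NEON-capable ARM toolchain (e.g. compile with -mfpu=neon);
// buffer size and values are arbitrary test data.
#include <arm_neon.h>
#include <cmath>
#include <cstdio>

static float scalar_peak (const float* buf, unsigned n, float current)
{
	for (unsigned i = 0; i < n; ++i) {
		float a = std::fabs (buf[i]);
		if (a > current) {
			current = a;
		}
	}
	return current;
}

static float neon_style_peak (const float* buf, unsigned n, float current)
{
	float32x4_t v4current = vdupq_n_f32 (current);

	// 4 samples at a time
	while (n >= 4) {
		v4current = vmaxq_f32 (v4current, vabsq_f32 (vld1q_f32 (buf)));
		buf += 4;
		n -= 4;
	}
	// remainder: broadcast the single sample across the vector
	while (n > 0) {
		v4current = vmaxq_f32 (v4current, vabsq_f32 (vdupq_n_f32 (*buf)));
		buf++;
		n--;
	}
	// horizontal max: pairwise max done twice, then take lane 0
	float32x2_t v2 = vpmax_f32 (vget_low_f32 (v4current), vget_high_f32 (v4current));
	v2 = vpmax_f32 (v2, v2);
	return vget_lane_f32 (v2, 0);
}

int main ()
{
	const float buf[7] = { 0.1f, -0.9f, 0.3f, 0.25f, -0.4f, 0.7f, -0.2f };
	std::printf ("scalar: %f\n", scalar_peak (buf, 7, 0.0f));
	std::printf ("neon:   %f\n", neon_style_peak (buf, 7, 0.0f));
	return 0;
}

On a NEON-capable board both lines should print the same value (0.900000 for this input), which is a quick way to sanity-check the reduction before wiring the functions into setup_hardware_optimization.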