qtractor: add back my NEON patch - it got lost somewhere

Signed-off-by: Andreas Müller <schnitzeltony@gmail.com>
This commit is contained in:
Andreas Müller
2018-03-07 22:34:57 +01:00
parent 50acacac58
commit 66548544ca
2 changed files with 93 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
From e96d295e0d6b36b9b722ad3d4c0b2013e569e9d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
Date: Tue, 3 Oct 2017 21:45:43 +0200
Subject: [PATCH] Add ARM NEON acceleration for time stretch - not yet tested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
---
src/qtractorTimeStretch.cpp | 58 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/src/qtractorTimeStretch.cpp b/src/qtractorTimeStretch.cpp
index 751b9bc8..6461b8b9 100644
--- a/src/qtractorTimeStretch.cpp
+++ b/src/qtractorTimeStretch.cpp
@@ -121,6 +121,60 @@ static inline float sse_cross_corr (
#endif
+#if defined(__ARM_NEON__)
+#include "arm_neon.h"
+
+// NEON enabled version: returns sum(pV1[i] * pV2[i]) / sqrt(sum(pV1[i]^2)), 16 floats per pass.
+static inline float neon_cross_corr (
+ const float *pV1, const float *pV2, unsigned int iOverlapLength )
+{
+ float32x4_t vCorr, vNorm, vTemp;
+
+ // See notes in sse_cross_corr (the SSE counterpart of this routine)
+
+ // iOverlapLength should be divisible by 16: a remainder (length % 16) is not processed
+ // assert((iOverlapLength % 16) == 0);
+ iOverlapLength >>= 4; // from here on: number of 16-float passes
+
+ // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+ vCorr = vdupq_n_f32(0.0); // four partial sums of pV1 * pV2
+ vNorm = vdupq_n_f32(0.0); // four partial sums of pV1 * pV1
+
+ // Loop unrolled by 4: each pass handles four 4-float vectors (16 floats)
+ for (unsigned int i = 0; i < iOverlapLength; ++i) {
+ // vCorr += pV1[0..3] * pV2[0..3]; vNorm += pV1[0..3]^2
+ vTemp = vld1q_f32(pV1);
+ vCorr = vmlaq_f32(vCorr, vTemp, vld1q_f32(pV2));
+ vNorm = vmlaq_f32(vNorm, vTemp, vTemp);
+ // vCorr += pV1[4..7] * pV2[4..7]; vNorm += pV1[4..7]^2
+ vTemp = vld1q_f32(pV1 + 4);
+ vCorr = vmlaq_f32(vCorr, vTemp, vld1q_f32(pV2 + 4));
+ vNorm = vmlaq_f32(vNorm, vTemp, vTemp);
+ // vCorr += pV1[8..11] * pV2[8..11]; vNorm += pV1[8..11]^2
+ vTemp = vld1q_f32(pV1 + 8);
+ vCorr = vmlaq_f32(vCorr, vTemp, vld1q_f32(pV2 + 8));
+ vNorm = vmlaq_f32(vNorm, vTemp, vTemp);
+ // vCorr += pV1[12..15] * pV2[12..15]; vNorm += pV1[12..15]^2
+ vTemp = vld1q_f32(pV1 + 12);
+ vCorr = vmlaq_f32(vCorr, vTemp, vld1q_f32(pV2 + 12));
+ vNorm = vmlaq_f32(vNorm, vTemp, vTemp);
+ pV1 += 16; // advance both inputs past the 16 floats just consumed
+ pV2 += 16;
+ }
+
+ float pvNorm[4]; // store the four vNorm lanes so they can be added as scalars
+ vst1q_f32(pvNorm, vNorm);
+ float fNorm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+
+ if (fNorm < 1e-9f) fNorm = 1.0f; // guard the division below against a (near-)zero norm
+
+ float pvCorr[4]; // same lane-wise store and scalar sum for vCorr
+ vst1q_f32(pvCorr, vCorr);
+ return (pvCorr[0] + pvCorr[1] + pvCorr[2] + pvCorr[3]) / ::sqrtf(fNorm);
+}
+
+#endif
+
// Standard (slow) version.
static inline float std_cross_corr (
@@ -166,6 +220,10 @@ qtractorTimeStretch::qtractorTimeStretch (
if (sse_enabled())
m_pfnCrossCorr = sse_cross_corr;
else
+#endif
+#if defined(__ARM_NEON__)
+ m_pfnCrossCorr = neon_cross_corr;
+ if(false)
#endif
m_pfnCrossCorr = std_cross_corr;
--
2.14.3

View File

@@ -23,6 +23,7 @@ SRC_URI = " \
\
file://0001-do-nor-try-run-for-float-sse-detection.patch \
file://0002-do-nor-try-run-for-suil-libs-detection.patch \
file://0003-Add-ARM-NEON-acceleration-for-time-stretch-not-yet-t.patch \
file://Qtractor.conf \
"
SRCREV = "010bd49a69df14f457770bc29824a4af15a2ee50"