summaryrefslogtreecommitdiff
path: root/media/libsoundtouch/src/TDStretch.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'media/libsoundtouch/src/TDStretch.cpp')
-rw-r--r--media/libsoundtouch/src/TDStretch.cpp443
1 files changed, 290 insertions, 153 deletions
diff --git a/media/libsoundtouch/src/TDStretch.cpp b/media/libsoundtouch/src/TDStretch.cpp
index b955bfc961..709e979d1d 100644
--- a/media/libsoundtouch/src/TDStretch.cpp
+++ b/media/libsoundtouch/src/TDStretch.cpp
@@ -1,11 +1,17 @@
-////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
///
/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo
/// while maintaining the original pitch by using a time domain WSOLA-like
/// method with several performance-increasing tweaks.
///
-/// Note : MMX optimized functions reside in a separate, platform-specific
-/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Notes : MMX optimized functions reside in a separate, platform-specific
+/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
+///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores
+/// in parallel. See the following article link for more detailed discussion
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
///
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
@@ -13,13 +19,6 @@
///
////////////////////////////////////////////////////////////////////////////////
//
-// Last changed : $Date: 2015-02-22 15:07:12 +0000 (Sun, 22 Feb 2015) $
-// File revision : $Revision: 1.12 $
-//
-// $Id: TDStretch.cpp 205 2015-02-22 15:07:12Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
// License :
//
// SoundTouch audio processing library
@@ -55,7 +54,6 @@ using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y))
-
/*****************************************************************************
*
* Constant definitions
@@ -63,7 +61,7 @@ using namespace soundtouch;
*****************************************************************************/
// Table for the hierarchical mixing position seeking algorithm
-static const short _scanOffsets[5][24]={
+const short _scanOffsets[5][24]={
{ 124, 186, 248, 310, 372, 434, 496, 558, 620, 682, 744, 806,
868, 930, 992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488, 0},
{-100, -75, -50, -25, 25, 50, 75, 100, 0, 0, 0, 0,
@@ -94,9 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
bAutoSeqSetting = true;
bAutoSeekSetting = true;
-// outDebt = 0;
- skipFract = 0;
-
tempo = 1.0f;
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
setTempo(1.0f);
@@ -202,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
m1 = (SAMPLETYPE)0;
m2 = (SAMPLETYPE)overlapLength;
- for (i = 0; i < overlapLength ; i ++)
+ for (i = 0; i < overlapLength ; i ++)
{
pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
m1 += 1;
@@ -222,6 +217,10 @@ void TDStretch::clearInput()
{
inputBuffer.clear();
clearMidBuffer();
+ isBeginning = true;
+ maxnorm = 0;
+ maxnormf = 1e8;
+ skipFract = 0;
}
@@ -255,7 +254,7 @@ int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
if (bQuickSeek)
{
return seekBestOverlapPositionQuick(refPos);
- }
+ }
else
{
return seekBestOverlapPositionFull(refPos);
@@ -287,7 +286,6 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
}
-
// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
// routine
//
@@ -301,21 +299,23 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
int i;
double norm;
- bestCorr = FLT_MIN;
+ bestCorr = -FLT_MAX;
bestOffs = 0;
// Scans for the best correlation value by testing each possible position
// over the permitted range.
bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
+ bestCorr = (bestCorr + 0.1) * 0.75;
#pragma omp parallel for
- for (i = 1; i < seekLength; i ++)
+ for (i = 1; i < seekLength; i ++)
{
double corr;
// Calculates correlation value for the mixing position corresponding to 'i'
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
// in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
// iterate the loop in sequential order
+ // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
#else
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@@ -341,6 +341,11 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
}
}
}
+
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+ adaptNormalizer();
+#endif
+
// clear cross correlation routine state if necessary (is so e.g. in MMX routines).
clearCrossCorrState();
@@ -348,64 +353,157 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
}
-// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
-// routine
+// Quick seek algorithm for improved runtime-performance: First roughly scans through the
+// correlation area, and then scan surroundings of two best preliminary correlation candidates
+// with improved precision
//
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
+// Based on testing:
+// - This algorithm gives on average 99% as good match as the full algorithm
+// - this quick seek algorithm finds the best match on ~90% of cases
+// - on those 10% of cases when this algorithm doesn't find best match,
+// it still finds on average ~90% match vs. the best possible match
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
{
- int j;
+#define _MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define SCANSTEP 16
+#define SCANWIND 8
+
int bestOffs;
- double bestCorr, corr;
- int scanCount, corrOffset, tempOffset;
+ int i;
+ int bestOffs2;
+ float bestCorr, corr;
+ float bestCorr2;
+ double norm;
+
+ // note: 'float' types used in this function in case that the platform would need to use software-fp
- bestCorr = FLT_MIN;
- bestOffs = _scanOffsets[0][0];
- corrOffset = 0;
- tempOffset = 0;
+ bestCorr =
+ bestCorr2 = -FLT_MAX;
+ bestOffs =
+ bestOffs2 = SCANWIND;
- // Scans for the best correlation value using four-pass hierarchical search.
+ // Scans for the best correlation value by testing each possible position
+ // over the permitted range. Look for two best matches on the first pass to
+ // increase possibility of ideal match.
//
- // The look-up table 'scans' has hierarchical position adjusting steps.
- // In first pass the routine searhes for the highest correlation with
- // relatively coarse steps, then rescans the neighbourhood of the highest
- // correlation with better resolution and so on.
- for (scanCount = 0;scanCount < 4; scanCount ++)
- {
- j = 0;
- while (_scanOffsets[scanCount][j])
+ // Begin from "SCANSTEP" instead of SCANWIND to make the calculation
+ // catch the 'middlepoint' of seekLength vector as that's the a-priori
+ // expected best match position
+ //
+ // Roughly:
+ // - 15% of cases find best result directly on the first round,
+ // - 75% cases find better match on 2nd round around the best match from 1st round
+ // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
+ for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
+ {
+ // Calculates correlation value for the mixing position corresponding
+ // to 'i'
+ corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+ // heuristic rule to slightly favour values close to mid of the seek range
+ float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+ corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+ // Checks for the highest correlation value
+ if (corr > bestCorr)
{
- double norm;
- tempOffset = corrOffset + _scanOffsets[scanCount][j];
- if (tempOffset >= seekLength) break;
-
- // Calculates correlation value for the mixing position corresponding
- // to 'tempOffset'
- corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm);
- // heuristic rule to slightly favour values close to mid of the range
- double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
- corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
-
- // Checks for the highest correlation value
- if (corr > bestCorr)
- {
- bestCorr = corr;
- bestOffs = tempOffset;
- }
- j ++;
+ // found new best match. keep the previous best as 2nd best match
+ bestCorr2 = bestCorr;
+ bestOffs2 = bestOffs;
+ bestCorr = corr;
+ bestOffs = i;
+ }
+ else if (corr > bestCorr2)
+ {
+ // not new best, but still new 2nd best match
+ bestCorr2 = corr;
+ bestOffs2 = i;
+ }
+ }
+
+ // Scans surroundings of the found best match with small stepping
+ int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
+ for (i = bestOffs - SCANWIND; i < end; i++)
+ {
+ if (i == bestOffs) continue; // this offset already calculated, thus skip
+
+ // Calculates correlation value for the mixing position corresponding
+ // to 'i'
+ corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+ // heuristic rule to slightly favour values close to mid of the range
+ float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+ corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+ // Checks for the highest correlation value
+ if (corr > bestCorr)
+ {
+ bestCorr = corr;
+ bestOffs = i;
+ }
+ }
+
+ // Scans surroundings of the 2nd best match with small stepping
+ end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
+ for (i = bestOffs2 - SCANWIND; i < end; i++)
+ {
+ if (i == bestOffs2) continue; // this offset already calculated, thus skip
+
+ // Calculates correlation value for the mixing position corresponding
+ // to 'i'
+ corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+ // heuristic rule to slightly favour values close to mid of the range
+ float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+ corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+ // Checks for the highest correlation value
+ if (corr > bestCorr)
+ {
+ bestCorr = corr;
+ bestOffs = i;
}
- corrOffset = bestOffs;
}
+
// clear cross correlation routine state if necessary (is so e.g. in MMX routines).
clearCrossCorrState();
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+ adaptNormalizer();
+#endif
+
return bestOffs;
}
+
+/// For integer algorithm: adapt normalization factor divider with music so that
+/// it'll not be pessimistically restrictive that can degrade quality on quieter sections
+/// yet won't cause integer overflows either
+void TDStretch::adaptNormalizer()
+{
+ // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
+ // too low values during pauses in music
+ if ((maxnorm > 1000) || (maxnormf > 40000000))
+ {
+ //norm averaging filter
+ maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
+
+ if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
+ {
+ // large values, so increase divider
+ overlapDividerBitsNorm++;
+ if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
+ }
+ else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
+ {
+ // extra small values, decrease divider
+ overlapDividerBitsNorm--;
+ }
+ }
+
+ maxnorm = 0;
+}
+
+
/// clear cross correlation routine state if necessary
void TDStretch::clearCrossCorrState()
{
@@ -417,18 +515,18 @@ void TDStretch::clearCrossCorrState()
void TDStretch::calcSeqParameters()
{
// Adjust tempo param according to tempo, so that variating processing sequence length is used
- // at varius tempo settings, between the given low...top limits
+ // at various tempo settings, between the given low...top limits
#define AUTOSEQ_TEMPO_LOW 0.5 // auto setting low tempo range (-50%)
#define AUTOSEQ_TEMPO_TOP 2.0 // auto setting top tempo range (+100%)
// sequence-ms setting values at above low & top tempo
- #define AUTOSEQ_AT_MIN 125.0
- #define AUTOSEQ_AT_MAX 50.0
+ #define AUTOSEQ_AT_MIN 90.0
+ #define AUTOSEQ_AT_MAX 40.0
#define AUTOSEQ_K ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
#define AUTOSEQ_C (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
- // seek-window-ms setting values at above low & top tempo
- #define AUTOSEEK_AT_MIN 25.0
+ // seek-window-ms setting values at above low & top tempoq
+ #define AUTOSEEK_AT_MIN 20.0
#define AUTOSEEK_AT_MAX 15.0
#define AUTOSEEK_K ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
#define AUTOSEEK_C (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
@@ -464,7 +562,7 @@ void TDStretch::calcSeqParameters()
// Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower
// tempo, larger faster tempo.
-void TDStretch::setTempo(float newTempo)
+void TDStretch::setTempo(double newTempo)
{
int intskip;
@@ -475,7 +573,7 @@ void TDStretch::setTempo(float newTempo)
// Calculate ideal skip length (according to tempo value)
nominalSkip = tempo * (seekWindowLength - overlapLength);
- intskip = (int)(nominalSkip + 0.5f);
+ intskip = (int)(nominalSkip + 0.5);
// Calculate how many samples are needed in the 'inputBuffer' to
// process another batch of samples
@@ -488,9 +586,8 @@ void TDStretch::setTempo(float newTempo)
// Sets the number of channels, 1 = mono, 2 = stereo
void TDStretch::setChannels(int numChannels)
{
- assert(numChannels > 0);
- if (channels == numChannels) return;
-// assert(numChannels == 1 || numChannels == 2);
+ if (!verifyNumberOfChannels(numChannels) ||
+ (channels == numChannels)) return;
channels = numChannels;
inputBuffer.setChannels(channels);
@@ -539,7 +636,8 @@ void TDStretch::processNominalTempo()
// the result into 'outputBuffer'
void TDStretch::processSamples()
{
- int ovlSkip, offset;
+ int ovlSkip;
+ int offset = 0;
int temp;
/* Removed this small optimization - can introduce a click to sound when tempo setting
@@ -556,35 +654,62 @@ void TDStretch::processSamples()
// to form a processing frame.
while ((int)inputBuffer.numSamples() >= sampleReq)
{
- // If tempo differs from the normal ('SCALE'), scan for the best overlapping
- // position
- offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
-
- // Mix the samples in the 'inputBuffer' at position of 'offset' with the
- // samples in 'midBuffer' using sliding overlapping
- // ... first partially overlap with the end of the previous sequence
- // (that's in 'midBuffer')
- overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
- outputBuffer.putSamples((uint)overlapLength);
+ if (isBeginning == false)
+ {
+ // apart from the very beginning of the track,
+ // scan for the best overlapping position & do overlap-add
+ offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
+
+ // Mix the samples in the 'inputBuffer' at position of 'offset' with the
+ // samples in 'midBuffer' using sliding overlapping
+ // ... first partially overlap with the end of the previous sequence
+ // (that's in 'midBuffer')
+ overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
+ outputBuffer.putSamples((uint)overlapLength);
+ offset += overlapLength;
+ }
+ else
+ {
+ // Adjust processing offset at beginning of track by not perform initial overlapping
+ // and compensating that in the 'input buffer skip' calculation
+ isBeginning = false;
+ int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
+
+ #ifdef ST_SIMD_AVOID_UNALIGNED
+ // in SIMD mode, round the skip amount to value corresponding to aligned memory address
+ if (channels == 1)
+ {
+ skip &= -4;
+ }
+ else if (channels == 2)
+ {
+ skip &= -2;
+ }
+ #endif
+ skipFract -= skip;
+ if (skipFract <= -nominalSkip)
+ {
+ skipFract = -nominalSkip;
+ }
+ }
// ... then copy sequence samples from 'inputBuffer' to output:
- // length of sequence
- temp = (seekWindowLength - 2 * overlapLength);
-
// crosscheck that we don't have buffer overflow...
- if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
+ if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
{
continue; // just in case, shouldn't really happen
}
- outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+ // length of sequence
+ temp = (seekWindowLength - 2 * overlapLength);
+ outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
// Copies the end of the current sequence from 'inputBuffer' to
// 'midBuffer' for being mixed with the beginning of the next
// processing sequence and so on
- assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
- memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength),
+ assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
+ memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp),
channels * sizeof(SAMPLETYPE) * overlapLength);
// Remove the processed samples from the input buffer. Update
@@ -680,7 +805,7 @@ TDStretch * TDStretch::newInstance()
//////////////////////////////////////////////////////////////////////////////
//
-// Integer arithmetics specific algorithm implementations.
+// Integer arithmetic specific algorithm implementations.
//
//////////////////////////////////////////////////////////////////////////////
@@ -694,7 +819,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
short temp;
int cnt2;
- for (i = 0; i < overlapLength ; i ++)
+ for (i = 0; i < overlapLength ; i ++)
{
temp = (short)(overlapLength - i);
cnt2 = 2 * i;
@@ -706,21 +831,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
// version of the routine.
-void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
+void TDStretch::overlapMulti(short *poutput, const short *input) const
{
- SAMPLETYPE m1=(SAMPLETYPE)0;
- SAMPLETYPE m2;
- int i=0;
+ short m1;
+ int i = 0;
- for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
+ for (m1 = 0; m1 < overlapLength; m1 ++)
{
+ short m2 = (short)(overlapLength - m1);
for (int c = 0; c < channels; c ++)
{
poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength;
i++;
}
-
- m1++;
}
}
@@ -743,13 +866,15 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
// calculate overlap length so that it's power of 2 - thus it's easy to do
// integer division by right-shifting. Term "-1" at end is to account for
// the extra most significatnt bit left unused in result by signed multiplication
- overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
- if (overlapDividerBits > 9) overlapDividerBits = 9;
- if (overlapDividerBits < 3) overlapDividerBits = 3;
- newOvl = (int)pow(2.0, (int)overlapDividerBits + 1); // +1 => account for -1 above
+ overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
+ if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
+ if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
+ newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1); // +1 => account for -1 above
acceptNewOverlapLength(newOvl);
+ overlapDividerBitsNorm = overlapDividerBitsPure;
+
// calculate sloping divider so that crosscorrelation operation won't
// overflow 32-bit register. Max. sum of the crosscorrelation sum without
// divider would be 2^30*(N^3-N)/3, where N = overlap length
@@ -757,28 +882,40 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
}
-double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
{
long corr;
- long lnorm;
+ unsigned long lnorm;
int i;
+ #ifdef ST_SIMD_AVOID_UNALIGNED
+ // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+ if (((ulongptr)mixingPos) & 15) return -1e50;
+ #endif
+
+ // hint compiler autovectorization that loop length is divisible by 8
+ int ilength = (channels * overlapLength) & -8;
+
corr = lnorm = 0;
- // Same routine for stereo and mono. For stereo, unroll loop for better
- // efficiency and gives slightly better resolution against rounding.
- // For mono it same routine, just unrolls loop by factor of 4
- for (i = 0; i < channels * overlapLength; i += 4)
+ // Same routine for stereo and mono
+ for (i = 0; i < ilength; i += 2)
{
corr += (mixingPos[i] * compare[i] +
- mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
- corr += (mixingPos[i + 2] * compare[i + 2] +
- mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+ mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i] * mixingPos[i] +
- mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
- lnorm += (mixingPos[i + 2] * mixingPos[i + 2] +
- mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
+ mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
+ // do intermediate scalings to avoid integer overflow
}
+ if (lnorm > maxnorm)
+ {
+ // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+ #pragma omp critical
+ if (lnorm > maxnorm)
+ {
+ maxnorm = lnorm;
+ }
+ }
// Normalize result by dividing by sqrt(norm) - this step is easiest
// done using floating point operation
norm = (double)lnorm;
@@ -787,38 +924,42 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
{
long corr;
long lnorm;
int i;
+ // hint compiler autovectorization that loop length is divisible by 8
+ int ilength = (channels * overlapLength) & -8;
+
// cancel first normalizer tap from previous round
lnorm = 0;
for (i = 1; i <= channels; i ++)
{
- lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits;
+ lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
}
corr = 0;
- // Same routine for stereo and mono. For stereo, unroll loop for better
- // efficiency and gives slightly better resolution against rounding.
- // For mono it same routine, just unrolls loop by factor of 4
- for (i = 0; i < channels * overlapLength; i += 4)
+ // Same routine for stereo and mono.
+ for (i = 0; i < ilength; i += 2)
{
corr += (mixingPos[i] * compare[i] +
- mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
- corr += (mixingPos[i + 2] * compare[i + 2] +
- mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+ mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
}
// update normalizer with last samples of this round
for (int j = 0; j < channels; j ++)
{
i --;
- lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
+ lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
}
+
norm += (double)lnorm;
+ if (norm > maxnorm)
+ {
+ maxnorm = (unsigned long)norm;
+ }
// Normalize result by dividing by sqrt(norm) - this step is easiest
// done using floating point operation
@@ -829,7 +970,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
//////////////////////////////////////////////////////////////////////////////
//
-// Floating point arithmetics specific algorithm implementations.
+// Floating point arithmetic specific algorithm implementations.
//
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
@@ -903,29 +1044,26 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
/// Calculate cross-correlation
-double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
{
- double corr;
- double norm;
+ float corr;
+ float norm;
int i;
- corr = norm = 0;
- // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
- // For mono it's same routine yet unrollsd by factor of 4.
- for (i = 0; i < channels * overlapLength; i += 4)
- {
- corr += mixingPos[i] * compare[i] +
- mixingPos[i + 1] * compare[i + 1];
-
- norm += mixingPos[i] * mixingPos[i] +
- mixingPos[i + 1] * mixingPos[i + 1];
+ #ifdef ST_SIMD_AVOID_UNALIGNED
+ // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+ if (((ulongptr)mixingPos) & 15) return -1e50;
+ #endif
- // unroll the loop for better CPU efficiency:
- corr += mixingPos[i + 2] * compare[i + 2] +
- mixingPos[i + 3] * compare[i + 3];
+ // hint compiler autovectorization that loop length is divisible by 8
+ int ilength = (channels * overlapLength) & -8;
- norm += mixingPos[i + 2] * mixingPos[i + 2] +
- mixingPos[i + 3] * mixingPos[i + 3];
+ corr = norm = 0;
+ // Same routine for stereo and mono
+ for (i = 0; i < ilength; i ++)
+ {
+ corr += mixingPos[i] * compare[i];
+ norm += mixingPos[i] * mixingPos[i];
}
anorm = norm;
@@ -934,9 +1072,9 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
{
- double corr;
+ float corr;
int i;
corr = 0;
@@ -947,14 +1085,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
norm -= mixingPos[-i] * mixingPos[-i];
}
- // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
- // For mono it's same routine yet unrollsd by factor of 4.
- for (i = 0; i < channels * overlapLength; i += 4)
+ // hint compiler autovectorization that loop length is divisible by 8
+ int ilength = (channels * overlapLength) & -8;
+
+ // Same routine for stereo and mono
+ for (i = 0; i < ilength; i ++)
{
- corr += mixingPos[i] * compare[i] +
- mixingPos[i + 1] * compare[i + 1] +
- mixingPos[i + 2] * compare[i + 2] +
- mixingPos[i + 3] * compare[i + 3];
+ corr += mixingPos[i] * compare[i];
}
// update normalizer with last samples of this round