1 files changed, 290 insertions, 153 deletions
diff --git a/media/libsoundtouch/src/TDStretch.cpp b/media/libsoundtouch/src/TDStretch.cpp
index b955bfc961..709e979d1d 100644
--- a/media/libsoundtouch/src/TDStretch.cpp
+++ b/media/libsoundtouch/src/TDStretch.cpp
@@ -1,11 +1,17 @@
-////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
 /// 
 /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
 /// while maintaining the original pitch by using a time domain WSOLA-like 
 /// method with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific 
-/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Notes : MMX optimized functions reside in a separate, platform-specific 
+/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
+///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores 
+/// in parallel. See the following article link for more detailed discussion 
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -13,13 +19,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-22 15:07:12 +0000 (Sun, 22 Feb 2015) $
-// File revision : $Revision: 1.12 $
-//
-// $Id: TDStretch.cpp 205 2015-02-22 15:07:12Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -55,7 +54,6 @@ using namespace soundtouch;
 
 #define max(x, y) (((x) > (y)) ? (x) : (y))
 
-
 /*****************************************************************************
  *
  * Constant definitions
@@ -63,7 +61,7 @@ using namespace soundtouch;
  *****************************************************************************/
 
 // Table for the hierarchical mixing position seeking algorithm
-static const short _scanOffsets[5][24]={
+const short _scanOffsets[5][24]={
     { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806,
       868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0},
     {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
@@ -94,9 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
     bAutoSeqSetting = true;
     bAutoSeekSetting = true;
 
-//    outDebt = 0;
-    skipFract = 0;
-
     tempo = 1.0f;
     setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     setTempo(1.0f);
@@ -202,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
     m1 = (SAMPLETYPE)0;
     m2 = (SAMPLETYPE)overlapLength;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
         m1 += 1;
@@ -222,6 +217,10 @@ void TDStretch::clearInput()
 {
     inputBuffer.clear();
     clearMidBuffer();
+    isBeginning = true;
+    maxnorm = 0;
+    maxnormf = 1e8;
+    skipFract = 0;
 }
 
 
@@ -255,7 +254,7 @@ int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
     if (bQuickSeek) 
     {
         return seekBestOverlapPositionQuick(refPos);
-    } 
+    }
     else 
     {
         return seekBestOverlapPositionFull(refPos);
@@ -287,7 +286,6 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 }
 
 
-
 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
 // routine
 //
@@ -301,21 +299,23 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
     int i;
     double norm;
 
-    bestCorr = FLT_MIN;
+    bestCorr = -FLT_MAX;
     bestOffs = 0;
 
     // Scans for the best correlation value by testing each possible position
     // over the permitted range.
     bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
+    bestCorr = (bestCorr + 0.1) * 0.75;
 
     #pragma omp parallel for
-    for (i = 1; i < seekLength; i ++) 
+    for (i = 1; i < seekLength; i ++)
     {
         double corr;
         // Calculates correlation value for the mixing position corresponding to 'i'
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
         // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
         // iterate the loop in sequential order
+        // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
         corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
 #else
         // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@@ -341,6 +341,11 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
             }
         }
     }
+
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
@@ -348,64 +353,157 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
 }
 
 
-// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
-// routine
+// Quick seek algorithm for improved runtime-performance: First roughly scans through the 
+// correlation area, and then scan surroundings of two best preliminary correlation candidates
+// with improved precision
 //
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) 
+// Based on testing:
+// - This algorithm gives on average 99% as good match as the full algorithm
+// - this quick seek algorithm finds the best match on ~90% of cases
+// - on those 10% of cases when this algorithm doesn't find best match, 
+//   it still finds on average ~90% match vs. the best possible match
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
 {
-    int j;
+#define _MIN(a, b)   (((a) < (b)) ? (a) : (b))
+#define SCANSTEP    16
+#define SCANWIND    8
+
     int bestOffs;
-    double bestCorr, corr;
-    int scanCount, corrOffset, tempOffset;
+    int i;
+    int bestOffs2;
+    float bestCorr, corr;
+    float bestCorr2;
+    double norm;
+
+    // note: 'float' types used in this function in case that the platform would need to use software-fp
 
-    bestCorr = FLT_MIN;
-    bestOffs = _scanOffsets[0][0];
-    corrOffset = 0;
-    tempOffset = 0;
+    bestCorr =
+    bestCorr2 = -FLT_MAX;
+    bestOffs = 
+    bestOffs2 = SCANWIND;
 
-    // Scans for the best correlation value using four-pass hierarchical search.
+    // Scans for the best correlation value by testing each possible position
+    // over the permitted range. Look for two best matches on the first pass to
+    // increase possibility of ideal match.
     //
-    // The look-up table 'scans' has hierarchical position adjusting steps.
-    // In first pass the routine searhes for the highest correlation with 
-    // relatively coarse steps, then rescans the neighbourhood of the highest
-    // correlation with better resolution and so on.
-    for (scanCount = 0;scanCount < 4; scanCount ++) 
-    {
-        j = 0;
-        while (_scanOffsets[scanCount][j]) 
+    // Begin from "SCANSTEP" instead of SCANWIND to make the calculation
+    // catch the 'middlepoint' of seekLength vector as that's the a-priori 
+    // expected best match position
+    //
+    // Roughly:
+    // - 15% of cases find best result directly on the first round,
+    // - 75% cases find better match on 2nd round around the best match from 1st round
+    // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
+    for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
+    {
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the seek range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
         {
-            double norm;
-            tempOffset = corrOffset + _scanOffsets[scanCount][j];
-            if (tempOffset >= seekLength) break;
-
-            // Calculates correlation value for the mixing position corresponding
-            // to 'tempOffset'
-            corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm);
-            // heuristic rule to slightly favour values close to mid of the range
-            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
-            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
-
-            // Checks for the highest correlation value
-            if (corr > bestCorr) 
-            {
-                bestCorr = corr;
-                bestOffs = tempOffset;
-            }
-            j ++;
+            // found new best match. keep the previous best as 2nd best match
+            bestCorr2 = bestCorr;
+            bestOffs2 = bestOffs;
+            bestCorr = corr;
+            bestOffs = i;
+        }
+        else if (corr > bestCorr2)
+        {
+            // not new best, but still new 2nd best match
+            bestCorr2 = corr;
+            bestOffs2 = i;
+        }
+    }
+
+    // Scans surroundings of the found best match with small stepping
+    int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
+    for (i = bestOffs - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
+        }
+    }
+
+    // Scans surroundings of the 2nd best match with small stepping
+    end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
+    for (i = bestOffs2 - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs2) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
         }
-        corrOffset = bestOffs;
     }
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     return bestOffs;
 }
 
 
 
+
+/// For integer algorithm: adapt normalization factor divider with music so that 
+/// it'll not be pessimistically restrictive that can degrade quality on quieter sections
+/// yet won't cause integer overflows either
+void TDStretch::adaptNormalizer()
+{
+    // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
+    // too low values during pauses in music
+    if ((maxnorm > 1000) || (maxnormf > 40000000))
+    { 
+        //norm averaging filter
+        maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
+
+        if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
+        {
+            // large values, so increase divider
+            overlapDividerBitsNorm++;
+            if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
+        }
+        else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
+        {
+            // extra small values, decrease divider
+            overlapDividerBitsNorm--;
+        }
+    }
+
+    maxnorm = 0;
+}
+
+
 /// clear cross correlation routine state if necessary 
 void TDStretch::clearCrossCorrState()
 {
@@ -417,18 +515,18 @@ void TDStretch::clearCrossCorrState()
 void TDStretch::calcSeqParameters()
 {
     // Adjust tempo param according to tempo, so that variating processing sequence length is used
-    // at varius tempo settings, between the given low...top limits
+    // at various tempo settings, between the given low...top limits
     #define AUTOSEQ_TEMPO_LOW   0.5     // auto setting low tempo range (-50%)
     #define AUTOSEQ_TEMPO_TOP   2.0     // auto setting top tempo range (+100%)
 
     // sequence-ms setting values at above low & top tempo
-    #define AUTOSEQ_AT_MIN      125.0
-    #define AUTOSEQ_AT_MAX      50.0
+    #define AUTOSEQ_AT_MIN      90.0
+    #define AUTOSEQ_AT_MAX      40.0
     #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
 
-    // seek-window-ms setting values at above low & top tempo
-    #define AUTOSEEK_AT_MIN     25.0
+    // seek-window-ms setting values at above low & top tempoq
+    #define AUTOSEEK_AT_MIN     20.0
     #define AUTOSEEK_AT_MAX     15.0
     #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEEK_C          (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
@@ -464,7 +562,7 @@ void TDStretch::calcSeqParameters()
 
 // Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower 
 // tempo, larger faster tempo.
-void TDStretch::setTempo(float newTempo)
+void TDStretch::setTempo(double newTempo)
 {
     int intskip;
 
@@ -475,7 +573,7 @@ void TDStretch::setTempo(float newTempo)
 
     // Calculate ideal skip length (according to tempo value) 
     nominalSkip = tempo * (seekWindowLength - overlapLength);
-    intskip = (int)(nominalSkip + 0.5f);
+    intskip = (int)(nominalSkip + 0.5);
 
     // Calculate how many samples are needed in the 'inputBuffer' to 
     // process another batch of samples
@@ -488,9 +586,8 @@ void TDStretch::setTempo(float newTempo)
 // Sets the number of channels, 1 = mono, 2 = stereo
 void TDStretch::setChannels(int numChannels)
 {
-    assert(numChannels > 0);
-    if (channels == numChannels) return;
-//    assert(numChannels == 1 || numChannels == 2);
+    if (!verifyNumberOfChannels(numChannels) ||
+        (channels == numChannels)) return;
 
     channels = numChannels;
     inputBuffer.setChannels(channels);
@@ -539,7 +636,8 @@ void TDStretch::processNominalTempo()
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
 {
-    int ovlSkip, offset;
+    int ovlSkip;
+    int offset = 0;
     int temp;
 
     /* Removed this small optimization - can introduce a click to sound when tempo setting
@@ -556,35 +654,62 @@ void TDStretch::processSamples()
     // to form a processing frame.
     while ((int)inputBuffer.numSamples() >= sampleReq) 
     {
-        // If tempo differs from the normal ('SCALE'), scan for the best overlapping
-        // position
-        offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
-
-        // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
-        // samples in 'midBuffer' using sliding overlapping
-        // ... first partially overlap with the end of the previous sequence
-        // (that's in 'midBuffer')
-        overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
-        outputBuffer.putSamples((uint)overlapLength);
+        if (isBeginning == false)
+        {
+            // apart from the very beginning of the track, 
+            // scan for the best overlapping position & do overlap-add
+            offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
+
+            // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
+            // samples in 'midBuffer' using sliding overlapping
+            // ... first partially overlap with the end of the previous sequence
+            // (that's in 'midBuffer')
+            overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
+            outputBuffer.putSamples((uint)overlapLength);
+            offset += overlapLength;
+        }
+        else
+        {
+            // Adjust processing offset at beginning of track by not perform initial overlapping
+            // and compensating that in the 'input buffer skip' calculation
+            isBeginning = false;
+            int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
+
+            #ifdef ST_SIMD_AVOID_UNALIGNED
+            // in SIMD mode, round the skip amount to value corresponding to aligned memory address
+            if (channels == 1)
+            {
+                skip &= -4;
+            }
+            else if (channels == 2)
+            {
+                skip &= -2;
+            }
+            #endif
+            skipFract -= skip;
+            if (skipFract <= -nominalSkip)
+            {
+                skipFract = -nominalSkip;
+            }
+        }
 
         // ... then copy sequence samples from 'inputBuffer' to output:
 
-        // length of sequence
-        temp = (seekWindowLength - 2 * overlapLength);
-
         // crosscheck that we don't have buffer overflow...
-        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
+        if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
         {
             continue;    // just in case, shouldn't really happen
         }
 
-        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+        // length of sequence
+        temp = (seekWindowLength - 2 * overlapLength);
+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
 
         // Copies the end of the current sequence from 'inputBuffer' to 
         // 'midBuffer' for being mixed with the beginning of the next 
         // processing sequence and so on
-        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
+        assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), 
             channels * sizeof(SAMPLETYPE) * overlapLength);
 
         // Remove the processed samples from the input buffer. Update
@@ -680,7 +805,7 @@ TDStretch * TDStretch::newInstance()
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// Integer arithmetics specific algorithm implementations.
+// Integer arithmetic specific algorithm implementations.
 //
 //////////////////////////////////////////////////////////////////////////////
 
@@ -694,7 +819,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
     short temp;
     int cnt2;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         temp = (short)(overlapLength - i);
         cnt2 = 2 * i;
@@ -706,21 +831,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
 
 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
 // version of the routine.
-void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
+void TDStretch::overlapMulti(short *poutput, const short *input) const
 {
-    SAMPLETYPE m1=(SAMPLETYPE)0;
-    SAMPLETYPE m2;
-    int i=0;
+    short m1;
+    int i = 0;
 
-    for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
+    for (m1 = 0; m1 < overlapLength; m1 ++)
     {
+        short m2 = (short)(overlapLength - m1);
         for (int c = 0; c < channels; c ++)
         {
             poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2)  / overlapLength;
             i++;
         }
-
-        m1++;
     }
 }
 
@@ -743,13 +866,15 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
     // calculate overlap length so that it's power of 2 - thus it's easy to do
     // integer division by right-shifting. Term "-1" at end is to account for 
     // the extra most significatnt bit left unused in result by signed multiplication 
-    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
-    if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 3) overlapDividerBits = 3;
-    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above
+    overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
+    if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
+    if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1);    // +1 => account for -1 above
 
     acceptNewOverlapLength(newOvl);
 
+    overlapDividerBitsNorm = overlapDividerBitsPure;
+
     // calculate sloping divider so that crosscorrelation operation won't 
     // overflow 32-bit register. Max. sum of the crosscorrelation sum without 
     // divider would be 2^30*(N^3-N)/3, where N = overlap length
@@ -757,28 +882,40 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 }
 
 
-double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
-    long lnorm;
+    unsigned long lnorm;
     int i;
 
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     corr = lnorm = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i += 2)
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
         lnorm += (mixingPos[i] * mixingPos[i] + 
-                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
-        lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + 
-                  mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
+                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
+        // do intermediate scalings to avoid integer overflow
     }
 
+    if (lnorm > maxnorm)
+    {
+        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+        #pragma omp critical
+        if (lnorm > maxnorm)
+        {
+            maxnorm = lnorm;
+        }
+    }
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     norm = (double)lnorm;
@@ -787,38 +924,42 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
     long lnorm;
     int i;
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     // cancel first normalizer tap from previous round
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
     {
-        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits;
+        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
     }
 
     corr = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono.
+    for (i = 0; i < ilength; i += 2) 
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
     }
 
     // update normalizer with last samples of this round
     for (int j = 0; j < channels; j ++)
     {
         i --;
-        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
+        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
     }
+
     norm += (double)lnorm;
+    if (norm > maxnorm)
+    {
+        maxnorm = (unsigned long)norm;
+    }
 
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
@@ -829,7 +970,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// Floating point arithmetics specific algorithm implementations.
+// Floating point arithmetic specific algorithm implementations.
 //
 
 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
@@ -903,29 +1044,26 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 
 
 /// Calculate cross-correlation
-double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
 {
-    double corr;
-    double norm;
+    float corr;
+    float norm;
     int i;
 
-    corr = norm = 0;
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
-    {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1];
-
-        norm += mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1];
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
 
-        // unroll the loop for better CPU efficiency:
-        corr += mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
 
-        norm += mixingPos[i + 2] * mixingPos[i + 2] +
-                mixingPos[i + 3] * mixingPos[i + 3];
+    corr = norm = 0;
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
+    {
+        corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
     }
 
     anorm = norm;
@@ -934,9 +1072,9 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
 {
-    double corr;
+    float corr;
     int i;
 
     corr = 0;
@@ -947,14 +1085,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
         norm -= mixingPos[-i] * mixingPos[-i];
     }
 
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
     {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1] +
-                mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+        corr += mixingPos[i] * compare[i];
     }
 
     // update normalizer with last samples of this round