27 files changed, 780 insertions, 566 deletions
diff --git a/media/libsoundtouch/src/AAFilter.cpp b/media/libsoundtouch/src/AAFilter.cpp
index b98a6d63ee..3fd4e9fe6d 100644
--- a/media/libsoundtouch/src/AAFilter.cpp
+++ b/media/libsoundtouch/src/AAFilter.cpp
@@ -12,13 +12,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-05 21:40:22 +0000 (Sun, 05 Jan 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: AAFilter.cpp 177 2014-01-05 21:40:22Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -75,7 +68,6 @@ using namespace soundtouch;
     #define _DEBUG_SAVE_AAFIR_COEFFS(x, y)
 #endif
 
-
 /*****************************************************************************
  *
  * Implementation of the class 'AAFilter'
@@ -90,14 +82,12 @@ AAFilter::AAFilter(uint len)
 }
 
 
-
 AAFilter::~AAFilter()
 {
     delete pFIR;
 }
 
 
-
 // Sets new anti-alias filter cut-off edge frequency, scaled to
 // sampling frequency (nyquist frequency = 0.5).
 // The filter will cut frequencies higher than the given frequency.
@@ -108,7 +98,6 @@ void AAFilter::setCutoffFreq(double newCutoffFreq)
 }
 
 
-
 // Sets number of FIR filter taps
 void AAFilter::setLength(uint newLength)
 {
@@ -117,7 +106,6 @@ void AAFilter::setLength(uint newLength)
 }
 
 
-
 // Calculates coefficients for a low-pass FIR filter using Hamming window
 void AAFilter::calculateCoeffs()
 {
@@ -177,12 +165,10 @@ void AAFilter::calculateCoeffs()
     for (i = 0; i < length; i ++) 
     {
         temp = work[i] * scaleCoeff;
-//#if SOUNDTOUCH_INTEGER_SAMPLES
         // scale & round to nearest integer
         temp += (temp >= 0) ? 0.5 : -0.5;
         // ensure no overfloods
         assert(temp >= -32768 && temp <= 32767);
-//#endif
         coeffs[i] = (SAMPLETYPE)temp;
     }
 
diff --git a/media/libsoundtouch/src/AAFilter.h b/media/libsoundtouch/src/AAFilter.h
index f1b5f2a554..81d836b750 100644
--- a/media/libsoundtouch/src/AAFilter.h
+++ b/media/libsoundtouch/src/AAFilter.h
@@ -13,13 +13,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-07 19:41:23 +0000 (Tue, 07 Jan 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: AAFilter.h 187 2014-01-07 19:41:23Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
diff --git a/media/libsoundtouch/src/FIFOSampleBuffer.cpp b/media/libsoundtouch/src/FIFOSampleBuffer.cpp
index 4e75c8a433..ad36875466 100644
--- a/media/libsoundtouch/src/FIFOSampleBuffer.cpp
+++ b/media/libsoundtouch/src/FIFOSampleBuffer.cpp
@@ -15,13 +15,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2012-11-08 18:53:01 +0000 (Thu, 08 Nov 2012) $
-// File revision : $Revision: 4 $
-//
-// $Id: FIFOSampleBuffer.cpp 160 2012-11-08 18:53:01Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -80,7 +73,8 @@ void FIFOSampleBuffer::setChannels(int numChannels)
 {
     uint usedBytes;
 
-    assert(numChannels > 0);
+    if (!verifyNumberOfChannels(numChannels)) return;
+
     usedBytes = channels * samplesInBuffer;
     channels = (uint)numChannels;
     samplesInBuffer = usedBytes / channels;
@@ -131,7 +125,7 @@ void FIFOSampleBuffer::putSamples(uint nSamples)
 //
 // Parameter 'slackCapacity' tells the function how much free capacity (in
 // terms of samples) there _at least_ should be, in order to the caller to
-// succesfully insert all the required samples to the buffer. When necessary, 
+// successfully insert all the required samples to the buffer. When necessary, 
 // the function grows the buffer size to comply with this requirement.
 //
 // When using this function as means for inserting new samples, also remember 
@@ -158,7 +152,7 @@ SAMPLETYPE *FIFOSampleBuffer::ptrBegin()
 }
 
 
-// Ensures that the buffer has enought capacity, i.e. space for _at least_
+// Ensures that the buffer has enough capacity, i.e. space for _at least_
 // 'capacityRequirement' number of samples. The buffer is grown in steps of
 // 4 kilobytes to eliminate the need for frequently growing up the buffer,
 // as well as to round the buffer size up to the virtual memory page size.
@@ -272,3 +266,10 @@ uint FIFOSampleBuffer::adjustAmountOfSamples(uint numSamples)
     return samplesInBuffer;
 }
 
+
+/// Add silence to end of buffer
+void FIFOSampleBuffer::addSilent(uint nSamples)
+{
+    memset(ptrEnd(nSamples), 0, sizeof(SAMPLETYPE) * nSamples * channels);
+    samplesInBuffer += nSamples;
+}
diff --git a/media/libsoundtouch/src/FIFOSampleBuffer.h b/media/libsoundtouch/src/FIFOSampleBuffer.h
index 6d2fd59f65..537a7b8722 100644
--- a/media/libsoundtouch/src/FIFOSampleBuffer.h
+++ b/media/libsoundtouch/src/FIFOSampleBuffer.h
@@ -15,13 +15,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-05 21:40:22 +0000 (Sun, 05 Jan 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: FIFOSampleBuffer.h 177 2014-01-05 21:40:22Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -119,7 +112,7 @@ public:
     /// 'putSamples(numSamples)' function.
     SAMPLETYPE *ptrEnd(
                 uint slackCapacity   ///< How much free capacity (in samples) there _at least_ 
-                                     ///< should be so that the caller can succesfully insert the 
+                                     ///< should be so that the caller can successfully insert the 
                                      ///< desired samples to the buffer. If necessary, the function 
                                      ///< grows the buffer size to comply with this requirement.
                 );
@@ -177,6 +170,9 @@ public:
     /// allow trimming (downwards) amount of samples in pipeline.
     /// Returns adjusted amount of samples
     uint adjustAmountOfSamples(uint numSamples);
+
+    /// Add silence to end of buffer
+    void addSilent(uint nSamples);
 };
 
 }
diff --git a/media/libsoundtouch/src/FIFOSamplePipe.h b/media/libsoundtouch/src/FIFOSamplePipe.h
index cd3191c1af..3def42d1ab 100644
--- a/media/libsoundtouch/src/FIFOSamplePipe.h
+++ b/media/libsoundtouch/src/FIFOSamplePipe.h
@@ -17,13 +17,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2012-06-13 19:29:53 +0000 (Wed, 13 Jun 2012) $
-// File revision : $Revision: 4 $
-//
-// $Id: FIFOSamplePipe.h 143 2012-06-13 19:29:53Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -58,6 +51,18 @@ namespace soundtouch
 /// Abstract base class for FIFO (first-in-first-out) sample processing classes.
 class FIFOSamplePipe
 {
+protected:
+
+    bool verifyNumberOfChannels(int nChannels) const
+    {
+        if ((nChannels > 0) && (nChannels <= SOUNDTOUCH_MAX_CHANNELS))
+        {
+            return true;
+        }
+        ST_THROW_RT_ERROR("Error: Illegal number of channels");
+        return false;
+    }
+
 public:
     // virtual default destructor
     virtual ~FIFOSamplePipe() {}
@@ -122,7 +127,6 @@ public:
 };
 
 
-
 /// Base-class for sound processing routines working in FIFO principle. With this base 
 /// class it's easy to implement sound processing stages that can be chained together,
 /// so that samples that are fed into beginning of the pipe automatically go through 
@@ -145,7 +149,6 @@ protected:
         output = pOutput;
     }
 
-
     /// Constructor. Doesn't define output pipe; it has to be set be 
     /// 'setOutPipe' function.
     FIFOProcessor()
@@ -153,7 +156,6 @@ protected:
         output = NULL;
     }
 
-
     /// Constructor. Configures output pipe.
     FIFOProcessor(FIFOSamplePipe *pOutput   ///< Output pipe.
                  )
@@ -161,13 +163,11 @@ protected:
         output = pOutput;
     }
 
-
     /// Destructor.
     virtual ~FIFOProcessor()
     {
     }
 
-
     /// Returns a pointer to the beginning of the output samples. 
     /// This function is provided for accessing the output samples directly. 
     /// Please be careful for not to corrupt the book-keeping!
@@ -194,7 +194,6 @@ public:
         return output->receiveSamples(outBuffer, maxSamples);
     }
 
-
     /// Adjusts book-keeping so that given number of samples are removed from beginning of the 
     /// sample buffer without copying them anywhere. 
     ///
@@ -206,14 +205,12 @@ public:
         return output->receiveSamples(maxSamples);
     }
 
-
     /// Returns number of samples currently available.
     virtual uint numSamples() const
     {
         return output->numSamples();
     }
 
-
     /// Returns nonzero if there aren't any samples available for outputting.
     virtual int isEmpty() const
     {
@@ -226,7 +223,6 @@ public:
     {
         return output->adjustAmountOfSamples(numSamples);
     }
-
 };
 
 }
diff --git a/media/libsoundtouch/src/FIRFilter.cpp b/media/libsoundtouch/src/FIRFilter.cpp
index dc7c4aa0ba..201395bd51 100644
--- a/media/libsoundtouch/src/FIRFilter.cpp
+++ b/media/libsoundtouch/src/FIRFilter.cpp
@@ -2,22 +2,21 @@
 ///
 /// General FIR digital filter routines with MMX optimization. 
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific file, 
+/// Notes : MMX optimized functions reside in a separate, platform-specific file, 
 /// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
 ///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores 
+/// in parallel. See the following article link for more detailed discussion 
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
+///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
 /// SoundTouch WWW: http://www.surina.net/soundtouch
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-21 21:24:29 +0000 (Sat, 21 Feb 2015) $
-// File revision : $Revision: 4 $
-//
-// $Id: FIRFilter.cpp 202 2015-02-21 21:24:29Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -61,14 +60,17 @@ FIRFilter::FIRFilter()
     length = 0;
     lengthDiv8 = 0;
     filterCoeffs = NULL;
+    filterCoeffsStereo = NULL;
 }
 
 
 FIRFilter::~FIRFilter()
 {
     delete[] filterCoeffs;
+    delete[] filterCoeffsStereo;
 }
 
+
 // Usual C-version of the filter routine for stereo sound
 uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
 {
@@ -78,35 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
     // because division is much slower operation than multiplying.
     double dScaler = 1.0 / (double)resultDivider;
 #endif
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
 
-    assert(length != 0);
-    assert(src != NULL);
-    assert(dest != NULL);
-    assert(filterCoeffs != NULL);
+    assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));
 
-    end = 2 * (numSamples - length);
+    end = 2 * (numSamples - ilength);
 
     #pragma omp parallel for
     for (j = 0; j < end; j += 2) 
     {
         const SAMPLETYPE *ptr;
         LONG_SAMPLETYPE suml, sumr;
-        uint i;
 
         suml = sumr = 0;
         ptr = src + j;
 
-        for (i = 0; i < length; i += 4) 
+        for (int i = 0; i < ilength; i ++)
         {
-            // loop is unrolled by factor of 4 here for efficiency
-            suml += ptr[2 * i + 0] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 2] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 4] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 6] * filterCoeffs[i + 3];
-            sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 3] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 5] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 7] * filterCoeffs[i + 3];
+            suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
+            sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
         }
 
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@@ -116,19 +109,14 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
         suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
         // saturate to 16 bit integer limits
         sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
-#else
-        suml *= dScaler;
-        sumr *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)suml;
         dest[j + 1] = (SAMPLETYPE)sumr;
     }
-    return numSamples - length;
+    return numSamples - ilength;
 }
 
 
-
-
 // Usual C-version of the filter routine for mono sound
 uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
 {
@@ -139,31 +127,28 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
     double dScaler = 1.0 / (double)resultDivider;
 #endif
 
-    assert(length != 0);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
 
-    end = numSamples - length;
+    assert(ilength != 0);
+
+    end = numSamples - ilength;
     #pragma omp parallel for
-    for (j = 0; j < end; j ++) 
+    for (j = 0; j < end; j ++)
     {
         const SAMPLETYPE *pSrc = src + j;
         LONG_SAMPLETYPE sum;
-        uint i;
+        int i;
 
         sum = 0;
-        for (i = 0; i < length; i += 4) 
+        for (i = 0; i < ilength; i ++)
         {
-            // loop is unrolled by factor of 4 here for efficiency
-            sum += pSrc[i + 0] * filterCoeffs[i + 0] + 
-                   pSrc[i + 1] * filterCoeffs[i + 1] + 
-                   pSrc[i + 2] * filterCoeffs[i + 2] + 
-                   pSrc[i + 3] * filterCoeffs[i + 3];
+            sum += pSrc[i] * filterCoeffs[i];
         }
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
         sum >>= resultDivFactor;
         // saturate to 16 bit integer limits
         sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
-#else
-        sum *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)sum;
     }
@@ -187,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
     assert(filterCoeffs != NULL);
     assert(numChannels < 16);
 
-    end = numChannels * (numSamples - length);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
+
+    end = numChannels * (numSamples - ilength);
 
     #pragma omp parallel for
     for (j = 0; j < end; j += numChannels)
     {
         const SAMPLETYPE *ptr;
         LONG_SAMPLETYPE sums[16];
-        uint c, i;
+        uint c;
+        int i;
 
         for (c = 0; c < numChannels; c ++)
         {
@@ -203,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
 
         ptr = src + j;
 
-        for (i = 0; i < length; i ++)
+        for (i = 0; i < ilength; i ++)
         {
             SAMPLETYPE coef=filterCoeffs[i];
             for (c = 0; c < numChannels; c ++)
@@ -217,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
         {
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
             sums[c] >>= resultDivFactor;
-#else
-            sums[c] *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
             dest[j+c] = (SAMPLETYPE)sums[c];
         }
     }
-    return numSamples - length;
+    return numSamples - ilength;
 }
 
 
@@ -235,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
     assert(newLength > 0);
     if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
 
+    #ifdef SOUNDTOUCH_FLOAT_SAMPLES
+        // scale coefficients already here if using floating samples
+        double scale = 1.0 / resultDivider;
+    #else
+        short scale = 1;
+    #endif
+
     lengthDiv8 = newLength / 8;
     length = lengthDiv8 * 8;
     assert(length == newLength);
@@ -244,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
 
     delete[] filterCoeffs;
     filterCoeffs = new SAMPLETYPE[length];
-    memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE));
+    delete[] filterCoeffsStereo;
+    filterCoeffsStereo = new SAMPLETYPE[length*2];
+    for (uint i = 0; i < length; i ++)
+    {
+        filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
+        // create also stereo set of filter coefficients: this allows compiler
+        // to autovectorize filter evaluation much more efficiently
+        filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
+        filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
+    }
 }
 
 
@@ -254,7 +257,6 @@ uint FIRFilter::getLength() const
 }
 
 
-
 // Applies the filter to the given sequence of samples. 
 //
 // Note : The amount of outputted samples is by value of 'filter_length' 
@@ -284,7 +286,6 @@ uint FIRFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSample
 }
 
 
-
 // Operator 'new' is overloaded so that it automatically creates a suitable instance 
 // depending on if we've a MMX-capable CPU available or not.
 void * FIRFilter::operator new(size_t s)
diff --git a/media/libsoundtouch/src/FIRFilter.h b/media/libsoundtouch/src/FIRFilter.h
index 70ba97cfe5..39c2cc7542 100644
--- a/media/libsoundtouch/src/FIRFilter.h
+++ b/media/libsoundtouch/src/FIRFilter.h
@@ -11,13 +11,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-21 21:24:29 +0000 (Sat, 21 Feb 2015) $
-// File revision : $Revision: 4 $
-//
-// $Id: FIRFilter.h 202 2015-02-21 21:24:29Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -64,6 +57,7 @@ protected:
 
     // Memory for filter coefficients
     SAMPLETYPE *filterCoeffs;
+    SAMPLETYPE *filterCoeffsStereo;
 
     virtual uint evaluateFilterStereo(SAMPLETYPE *dest, 
                                       const SAMPLETYPE *src, 
diff --git a/media/libsoundtouch/src/InterpolateCubic.cpp b/media/libsoundtouch/src/InterpolateCubic.cpp
index 8aa7374c74..b37b0fa801 100644
--- a/media/libsoundtouch/src/InterpolateCubic.cpp
+++ b/media/libsoundtouch/src/InterpolateCubic.cpp
@@ -8,10 +8,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateCubic.cpp 179 2014-01-06 18:41:42Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
diff --git a/media/libsoundtouch/src/InterpolateCubic.h b/media/libsoundtouch/src/InterpolateCubic.h
index e0e302b233..481abd64bc 100644
--- a/media/libsoundtouch/src/InterpolateCubic.h
+++ b/media/libsoundtouch/src/InterpolateCubic.h
@@ -8,10 +8,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateCubic.h 179 2014-01-06 18:41:42Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -45,7 +41,6 @@ namespace soundtouch
 class InterpolateCubic : public TransposerBase
 {
 protected:
-    virtual void resetRegisters();
     virtual int transposeMono(SAMPLETYPE *dest, 
                         const SAMPLETYPE *src, 
                         int &srcSamples);
@@ -56,10 +51,17 @@ protected:
                         const SAMPLETYPE *src, 
                         int &srcSamples);
 
-    float fract;
+    double fract;
 
 public:
     InterpolateCubic();
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 1;
+    }
 };
 
 }
diff --git a/media/libsoundtouch/src/InterpolateLinear.cpp b/media/libsoundtouch/src/InterpolateLinear.cpp
index ae26e69a1e..9533e79b79 100644
--- a/media/libsoundtouch/src/InterpolateLinear.cpp
+++ b/media/libsoundtouch/src/InterpolateLinear.cpp
@@ -8,10 +8,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateLinear.cpp 180 2014-01-06 19:16:02Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -146,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
         LONG_SAMPLETYPE temp, vol1;
     
         assert(iFract < SCALE);
-        vol1 = (SCALE - iFract);
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
         for (int c = 0; c < numChannels; c ++)
         {
             temp = vol1 * src[c] + iFract * src[c + numChannels];
@@ -170,9 +166,9 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
 
 // Sets new target iRate. Normal iRate = 1.0, smaller values represent slower 
 // iRate, larger faster iRates.
-void InterpolateLinearInteger::setRate(float newRate)
+void InterpolateLinearInteger::setRate(double newRate)
 {
-    iRate = (int)(newRate * SCALE + 0.5f);
+    iRate = (int)(newRate * SCALE + 0.5);
     TransposerBase::setRate(newRate);
 }
 
@@ -190,7 +186,7 @@ InterpolateLinearFloat::InterpolateLinearFloat() : TransposerBase()
     // Notice: use local function calling syntax for sake of clarity, 
     // to indicate the fact that C++ constructor can't call virtual functions.
     resetRegisters();
-    setRate(1.0f);
+    setRate(1.0);
 }
 
 
@@ -275,12 +271,13 @@ int InterpolateLinearFloat::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE *s
     i = 0;
     while (srcCount < srcSampleEnd)
     {
-        float temp, vol1;
+        float temp, vol1, fract_float;
     
-        vol1 = (1.0f- fract);
+        vol1 = (float)(1.0 - fract);
+		fract_float = (float)fract;
         for (int c = 0; c < numChannels; c ++)
         {
-            temp = vol1 * src[c] + fract * src[c + numChannels];
+			temp = vol1 * src[c] + fract_float * src[c + numChannels];
             *dest = (SAMPLETYPE)temp;
             dest ++;
         }
diff --git a/media/libsoundtouch/src/InterpolateLinear.h b/media/libsoundtouch/src/InterpolateLinear.h
index b76299f889..ff362e84b1 100644
--- a/media/libsoundtouch/src/InterpolateLinear.h
+++ b/media/libsoundtouch/src/InterpolateLinear.h
@@ -8,10 +8,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateLinear.h 179 2014-01-06 18:41:42Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -42,15 +38,13 @@
 namespace soundtouch
 {
 
-/// Linear transposer class that uses integer arithmetics
+/// Linear transposer class that uses integer arithmetic
 class InterpolateLinearInteger : public TransposerBase
 {
 protected:
     int iFract;
     int iRate;
 
-    virtual void resetRegisters();
-
     virtual int transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
                        int &srcSamples);
@@ -63,17 +57,22 @@ public:
 
     /// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
     /// rate, larger faster rates.
-    virtual void setRate(float newRate);
+    virtual void setRate(double newRate);
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 0;
+    }
 };
 
 
-/// Linear transposer class that uses floating point arithmetics
+/// Linear transposer class that uses floating point arithmetic
 class InterpolateLinearFloat : public TransposerBase
 {
 protected:
-    float fract;
-
-    virtual void resetRegisters();
+    double fract;
 
     virtual int transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
@@ -85,6 +84,13 @@ protected:
 
 public:
     InterpolateLinearFloat();
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 0;
+    }
 };
 
 }
diff --git a/media/libsoundtouch/src/InterpolateShannon.cpp b/media/libsoundtouch/src/InterpolateShannon.cpp
index 1085fd14cb..975d872ad6 100644
--- a/media/libsoundtouch/src/InterpolateShannon.cpp
+++ b/media/libsoundtouch/src/InterpolateShannon.cpp
@@ -13,10 +13,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateShannon.cpp 195 2014-04-06 15:57:21Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
diff --git a/media/libsoundtouch/src/InterpolateShannon.h b/media/libsoundtouch/src/InterpolateShannon.h
index 701640f7db..72ab0b526d 100644
--- a/media/libsoundtouch/src/InterpolateShannon.h
+++ b/media/libsoundtouch/src/InterpolateShannon.h
@@ -13,10 +13,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// $Id: InterpolateShannon.h 179 2014-01-06 18:41:42Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -50,7 +46,6 @@ namespace soundtouch
 class InterpolateShannon : public TransposerBase
 {
 protected:
-    void resetRegisters();
     int transposeMono(SAMPLETYPE *dest, 
                         const SAMPLETYPE *src, 
                         int &srcSamples);
@@ -61,10 +56,17 @@ protected:
                         const SAMPLETYPE *src, 
                         int &srcSamples);
 
-    float fract;
+    double fract;
 
 public:
     InterpolateShannon();
+
+    void resetRegisters();
+
+    int getLatency() const
+    {
+        return 3;
+    }
 };
 
 }
diff --git a/media/libsoundtouch/src/RateTransposer.cpp b/media/libsoundtouch/src/RateTransposer.cpp
index f1e3fd043b..4c202391e0 100644
--- a/media/libsoundtouch/src/RateTransposer.cpp
+++ b/media/libsoundtouch/src/RateTransposer.cpp
@@ -10,13 +10,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-04-06 15:57:21 +0000 (Sun, 06 Apr 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: RateTransposer.cpp 195 2014-04-06 15:57:21Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -57,15 +50,21 @@ TransposerBase::ALGORITHM TransposerBase::algorithm = TransposerBase::CUBIC;
 // Constructor
 RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
 {
-    bUseAAFilter = true;
+    bUseAAFilter = 
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+        true;
+#else
+        // Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover
+        false;
+#endif
 
     // Instantiates the anti-alias filter
     pAAFilter = new AAFilter(64);
     pTransposer = TransposerBase::newInstance();
+    clear();
 }
 
 
-
 RateTransposer::~RateTransposer()
 {
     delete pAAFilter;
@@ -73,11 +72,14 @@ RateTransposer::~RateTransposer()
 }
 
 
-
 /// Enables/disables the anti-alias filter. Zero to disable, nonzero to enable
 void RateTransposer::enableAAFilter(bool newMode)
 {
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+    // Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover
     bUseAAFilter = newMode;
+    clear();
+#endif
 }
 
 
@@ -94,23 +96,22 @@ AAFilter *RateTransposer::getAAFilter()
 }
 
 
-
 // Sets new target iRate. Normal iRate = 1.0, smaller values represent slower 
 // iRate, larger faster iRates.
-void RateTransposer::setRate(float newRate)
+void RateTransposer::setRate(double newRate)
 {
     double fCutoff;
 
     pTransposer->setRate(newRate);
 
     // design a new anti-alias filter
-    if (newRate > 1.0f) 
+    if (newRate > 1.0) 
     {
-        fCutoff = 0.5f / newRate;
+        fCutoff = 0.5 / newRate;
     } 
     else 
     {
-        fCutoff = 0.5f * newRate;
+        fCutoff = 0.5 * newRate;
     }
     pAAFilter->setCutoffFreq(fCutoff);
 }
@@ -177,11 +178,10 @@ void RateTransposer::processSamples(const SAMPLETYPE *src, uint nSamples)
 // Sets the number of channels, 1 = mono, 2 = stereo
 void RateTransposer::setChannels(int nChannels)
 {
-    assert(nChannels > 0);
+    if (!verifyNumberOfChannels(nChannels) ||
+        (pTransposer->numChannels == nChannels)) return;
 
-    if (pTransposer->numChannels == nChannels) return;
     pTransposer->setChannels(nChannels);
-
     inputBuffer.setChannels(nChannels);
     midBuffer.setChannels(nChannels);
     outputBuffer.setChannels(nChannels);
@@ -194,6 +194,11 @@ void RateTransposer::clear()
     outputBuffer.clear();
     midBuffer.clear();
     inputBuffer.clear();
+    pTransposer->resetRegisters();
+
+    // prefill buffer to avoid losing first samples at beginning of stream
+    int prefill = getLatency();
+    inputBuffer.addSilent(prefill);
 }
 
 
@@ -208,6 +213,14 @@ int RateTransposer::isEmpty() const
 }
 
 
+/// Return approximate initial input-output latency
+int RateTransposer::getLatency() const
+{
+    return pTransposer->getLatency() +
+        ((bUseAAFilter) ? (pAAFilter->getLength() / 2) : 0);
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // TransposerBase - Base class for interpolation
@@ -225,7 +238,7 @@ void TransposerBase::setAlgorithm(TransposerBase::ALGORITHM a)
 int TransposerBase::transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src)
 {
     int numSrcSamples = src.numSamples();
-    int sizeDemand = (int)((float)numSrcSamples / rate) + 8;
+    int sizeDemand = (int)((double)numSrcSamples / rate) + 8;
     int numOutput;
     SAMPLETYPE *psrc = src.ptrBegin();
     SAMPLETYPE *pdest = dest.ptrEnd(sizeDemand);
@@ -270,7 +283,7 @@ void TransposerBase::setChannels(int channels)
 }
 
 
-void TransposerBase::setRate(float newRate)
+void TransposerBase::setRate(double newRate)
 {
     rate = newRate;
 }
@@ -280,7 +293,7 @@ void TransposerBase::setRate(float newRate)
 TransposerBase *TransposerBase::newInstance()
 {
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
-    // Notice: For integer arithmetics support only linear algorithm (due to simplest calculus)
+    // Notice: For integer arithmetic support only linear algorithm (due to simplest calculus)
     return ::new InterpolateLinearInteger;
 #else
     switch (algorithm)
diff --git a/media/libsoundtouch/src/RateTransposer.h b/media/libsoundtouch/src/RateTransposer.h
index fad6d65718..59381fab5f 100644
--- a/media/libsoundtouch/src/RateTransposer.h
+++ b/media/libsoundtouch/src/RateTransposer.h
@@ -14,13 +14,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-04-06 15:57:21 +0000 (Sun, 06 Apr 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: RateTransposer.h 195 2014-04-06 15:57:21Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -66,8 +59,6 @@ public:
     };
 
 protected:
-    virtual void resetRegisters() = 0;
-
     virtual int transposeMono(SAMPLETYPE *dest, 
                         const SAMPLETYPE *src, 
                         int &srcSamples)  = 0;
@@ -81,15 +72,18 @@ protected:
     static ALGORITHM algorithm;
 
 public:
-    float rate;
+    double rate;
     int numChannels;
 
     TransposerBase();
     virtual ~TransposerBase();
 
     virtual int transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src);
-    virtual void setRate(float newRate);
+    virtual void setRate(double newRate);
     virtual void setChannels(int channels);
+    virtual int getLatency() const = 0;
+
+    virtual void resetRegisters() = 0;
 
     // static factory function
     static TransposerBase *newInstance();
@@ -132,21 +126,9 @@ public:
     RateTransposer();
     virtual ~RateTransposer();
 
-    /// Operator 'new' is overloaded so that it automatically creates a suitable instance 
-    /// depending on if we're to use integer or floating point arithmetics.
-//    static void *operator new(size_t s);
-
-    /// Use this function instead of "new" operator to create a new instance of this class. 
-    /// This function automatically chooses a correct implementation, depending on if 
-    /// integer ot floating point arithmetics are to be used.
-//    static RateTransposer *newInstance();
-
     /// Returns the output buffer object
     FIFOSamplePipe *getOutput() { return &outputBuffer; };
 
-    /// Returns the store buffer object
-//    FIFOSamplePipe *getStore() { return &storeBuffer; };
-
     /// Return anti-alias filter object
     AAFilter *getAAFilter();
 
@@ -158,7 +140,7 @@ public:
 
     /// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
     /// rate, larger faster rates.
-    virtual void setRate(float newRate);
+    virtual void setRate(double newRate);
 
     /// Sets the number of channels, 1 = mono, 2 = stereo
     void setChannels(int channels);
@@ -172,6 +154,9 @@ public:
 
     /// Returns nonzero if there aren't any samples available for outputting.
     int isEmpty() const;
+
+    /// Return approximate initial input-output latency
+    int getLatency() const;
 };
 
 }
diff --git a/media/libsoundtouch/src/STTypes.h b/media/libsoundtouch/src/STTypes.h
index ddd56d8080..c823fcfd23 100644
--- a/media/libsoundtouch/src/STTypes.h
+++ b/media/libsoundtouch/src/STTypes.h
@@ -8,13 +8,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-05-18 15:25:07 +0000 (Mon, 18 May 2015) $
-// File revision : $Revision: 3 $
-//
-// $Id: STTypes.h 215 2015-05-18 15:25:07Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -57,13 +50,20 @@ typedef unsigned long   ulong;
 #include "soundtouch_config.h"
 
 #if defined(WIN32)
-#define EXPORT __declspec(dllexport)
+#if defined(BUILDING_SOUNDTOUCH)
+#define SOUNDTOUCH_API __declspec(dllexport)
+#else
+#define SOUNDTOUCH_API __declspec(dllimport)
+#endif
 #else
-#define EXPORT
+#define SOUNDTOUCH_API
 #endif
 
 namespace soundtouch
 {
+    /// Max allowed number of channels
+    #define SOUNDTOUCH_MAX_CHANNELS     16
+
     /// Activate these undef's to overrule the possible sampletype 
     /// setting inherited from some other header file:
     //#undef SOUNDTOUCH_INTEGER_SAMPLES
@@ -126,10 +126,10 @@ namespace soundtouch
 
     #endif
 
-    // If defined, allows the SIMD-optimized routines to take minor shortcuts 
-    // for improved performance. Undefine to require faithfully similar SIMD 
-    // calculations as in normal C implementation.
-    #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
+    // If defined, allows the SIMD-optimized routines to skip unevenly aligned
+    // memory offsets that can cause performance penalty in some SIMD implementations.
+    // Causes slight compromise in sound quality.
+    // #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
 
 
     #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@@ -144,16 +144,19 @@ namespace soundtouch
         #endif // SOUNDTOUCH_FLOAT_SAMPLES
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
-            // Allow MMX optimizations
-            #define SOUNDTOUCH_ALLOW_MMX   1
+            // Allow MMX optimizations (not available in X64 mode)
+            #if (!_M_X64)
+                #define SOUNDTOUCH_ALLOW_MMX   1
+            #endif
         #endif
 
     #else
 
         // floating point samples
         typedef float  SAMPLETYPE;
-        // data type for sample accumulation: Use double to utilize full precision.
-        typedef double LONG_SAMPLETYPE;
+        // data type for sample accumulation: Use float also here to enable
+        // efficient autovectorization
+        typedef float LONG_SAMPLETYPE;
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
             // Allow SSE optimizations
@@ -162,10 +165,16 @@ namespace soundtouch
 
     #endif  // SOUNDTOUCH_INTEGER_SAMPLES
 
-};
+    #if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
+        #if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+            #define ST_SIMD_AVOID_UNALIGNED
+        #endif
+    #endif
+
+}
 
 // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
-// #define ST_NO_EXCEPTION_HANDLING    1
+#define ST_NO_EXCEPTION_HANDLING    1
 #ifdef ST_NO_EXCEPTION_HANDLING
     // Exceptions disabled. Throw asserts instead if enabled.
     #include <assert.h>
diff --git a/media/libsoundtouch/src/SoundTouch.cpp b/media/libsoundtouch/src/SoundTouch.cpp
index 955818810b..69fba8b9b5 100644
--- a/media/libsoundtouch/src/SoundTouch.cpp
+++ b/media/libsoundtouch/src/SoundTouch.cpp
@@ -41,13 +41,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-10-08 15:26:57 +0000 (Wed, 08 Oct 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: SoundTouch.cpp 201 2014-10-08 15:26:57Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -110,12 +103,14 @@ SoundTouch::SoundTouch()
 
     calcEffectiveRateAndTempo();
 
+    samplesExpectedOut = 0;
+    samplesOutput = 0;
+
     channels = 0;
     bSrateSet = false;
 }
 
 
-
 SoundTouch::~SoundTouch()
 {
     delete pRateTransposer;
@@ -123,7 +118,6 @@ SoundTouch::~SoundTouch()
 }
 
 
-
 /// Get SoundTouch library version string
 const char *SoundTouch::getVersionString()
 {
@@ -143,90 +137,79 @@ uint SoundTouch::getVersionId()
 // Sets the number of channels, 1 = mono, 2 = stereo
 void SoundTouch::setChannels(uint numChannels)
 {
-    /*if (numChannels != 1 && numChannels != 2) 
-    {
-        //ST_THROW_RT_ERROR("Illegal number of channels");
-		return;
-    }*/
+    if (!verifyNumberOfChannels(numChannels)) return;
+
     channels = numChannels;
     pRateTransposer->setChannels((int)numChannels);
     pTDStretch->setChannels((int)numChannels);
 }
 
 
-
 // Sets new rate control value. Normal rate = 1.0, smaller values
 // represent slower rate, larger faster rates.
-void SoundTouch::setRate(float newRate)
+void SoundTouch::setRate(double newRate)
 {
     virtualRate = newRate;
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets new rate control value as a difference in percents compared
 // to the original rate (-50 .. +100 %)
-void SoundTouch::setRateChange(float newRate)
+void SoundTouch::setRateChange(double newRate)
 {
-    virtualRate = 1.0f + 0.01f * newRate;
+    virtualRate = 1.0 + 0.01 * newRate;
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets new tempo control value. Normal tempo = 1.0, smaller values
 // represent slower tempo, larger faster tempo.
-void SoundTouch::setTempo(float newTempo)
+void SoundTouch::setTempo(double newTempo)
 {
     virtualTempo = newTempo;
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets new tempo control value as a difference in percents compared
 // to the original tempo (-50 .. +100 %)
-void SoundTouch::setTempoChange(float newTempo)
+void SoundTouch::setTempoChange(double newTempo)
 {
-    virtualTempo = 1.0f + 0.01f * newTempo;
+    virtualTempo = 1.0 + 0.01 * newTempo;
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets new pitch control value. Original pitch = 1.0, smaller values
 // represent lower pitches, larger values higher pitch.
-void SoundTouch::setPitch(float newPitch)
+void SoundTouch::setPitch(double newPitch)
 {
     virtualPitch = newPitch;
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets pitch change in octaves compared to the original pitch
 // (-1.00 .. +1.00)
-void SoundTouch::setPitchOctaves(float newPitch)
+void SoundTouch::setPitchOctaves(double newPitch)
 {
-    virtualPitch = (float)exp(0.69314718056f * newPitch);
+    virtualPitch = exp(0.69314718056 * newPitch);
     calcEffectiveRateAndTempo();
 }
 
 
-
 // Sets pitch change in semi-tones compared to the original pitch
 // (-12 .. +12)
 void SoundTouch::setPitchSemiTones(int newPitch)
 {
-    setPitchOctaves((float)newPitch / 12.0f);
+    setPitchOctaves((double)newPitch / 12.0);
 }
 
 
-
-void SoundTouch::setPitchSemiTones(float newPitch)
+void SoundTouch::setPitchSemiTones(double newPitch)
 {
-    setPitchOctaves(newPitch / 12.0f);
+    setPitchOctaves(newPitch / 12.0);
 }
 
 
@@ -234,8 +217,8 @@ void SoundTouch::setPitchSemiTones(float newPitch)
 // nominal control values.
 void SoundTouch::calcEffectiveRateAndTempo()
 {
-    float oldTempo = tempo;
-    float oldRate = rate;
+    double oldTempo = tempo;
+    double oldRate = rate;
 
     tempo = virtualTempo / virtualPitch;
     rate = virtualPitch * virtualRate;
@@ -302,23 +285,12 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples)
         ST_THROW_RT_ERROR("SoundTouch : Number of channels not defined");
     }
 
-    // Transpose the rate of the new samples if necessary
-    /* Bypass the nominal setting - can introduce a click in sound when tempo/pitch control crosses the nominal value...
-    if (rate == 1.0f) 
-    {
-        // The rate value is same as the original, simply evaluate the tempo changer. 
-        assert(output == pTDStretch);
-        if (pRateTransposer->isEmpty() == 0) 
-        {
-            // yet flush the last samples in the pitch transposer buffer
-            // (may happen if 'rate' changes from a non-zero value to zero)
-            pTDStretch->moveSamples(*pRateTransposer);
-        }
-        pTDStretch->putSamples(samples, nSamples);
-    } 
-    */
+    // accumulate how many samples are expected out from processing, given the current 
+    // processing setting
+    samplesExpectedOut += (double)nSamples / ((double)rate * (double)tempo);
+
 #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
-    else if (rate <= 1.0f) 
+    if (rate <= 1.0f) 
     {
         // transpose the rate down, output the transposed sound to tempo changer buffer
         assert(output == pTDStretch);
@@ -346,44 +318,30 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples)
 void SoundTouch::flush()
 {
     int i;
-    int nUnprocessed;
-    int nOut;
-    SAMPLETYPE *buff = new SAMPLETYPE[64 * channels];
-    
-    // check how many samples still await processing, and scale
-    // that by tempo & rate to get expected output sample count
-    nUnprocessed = numUnprocessedSamples();
-    nUnprocessed = (int)((double)nUnprocessed / (tempo * rate) + 0.5);
+    int numStillExpected;
+    SAMPLETYPE *buff = new SAMPLETYPE[128 * channels];
 
-    nOut = numSamples();        // ready samples currently in buffer ...
-    nOut += nUnprocessed;       // ... and how many we expect there to be in the end
-    
-    memset(buff, 0, 64 * channels * sizeof(SAMPLETYPE));
+    // how many samples are still expected to output
+    numStillExpected = (int)((long)(samplesExpectedOut + 0.5) - samplesOutput);
+    if (numStillExpected < 0) numStillExpected = 0;
+
+    memset(buff, 0, 128 * channels * sizeof(SAMPLETYPE));
     // "Push" the last active samples out from the processing pipeline by
     // feeding blank samples into the processing pipeline until new, 
     // processed samples appear in the output (not however, more than 
-    // 8ksamples in any case)
-    for (i = 0; i < 128; i ++) 
+    // 24ksamples in any case)
+    for (i = 0; (numStillExpected > (int)numSamples()) && (i < 200); i ++)
     {
-        putSamples(buff, 64);
-        if ((int)numSamples() >= nOut) 
-        {
-            // Enough new samples have appeared into the output!
-            // As samples come from processing with bigger chunks, now truncate it
-            // back to maximum "nOut" samples to improve duration accuracy 
-            adjustAmountOfSamples(nOut);
-
-            // finish
-            break;  
-        }
+        putSamples(buff, 128);
     }
 
+    adjustAmountOfSamples(numStillExpected);
+
     delete[] buff;
 
-    // Clear working buffers
-    pRateTransposer->clear();
+    // Clear input buffers
     pTDStretch->clearInput();
-    // yet leave the 'tempoChanger' output intouched as that's where the
+    // yet leave the output intouched as that's where the
     // flushed samples are!
 }
 
@@ -452,7 +410,7 @@ int SoundTouch::getSetting(int settingId) const
             return pRateTransposer->getAAFilter()->getLength();
 
         case SETTING_USE_QUICKSEEK :
-            return (uint)   pTDStretch->isQuickSeekEnabled();
+            return (uint)pTDStretch->isQuickSeekEnabled();
 
         case SETTING_SEQUENCE_MS:
             pTDStretch->getParameters(NULL, &temp, NULL, NULL);
@@ -466,13 +424,53 @@ int SoundTouch::getSetting(int settingId) const
             pTDStretch->getParameters(NULL, NULL, NULL, &temp);
             return temp;
 
-		case SETTING_NOMINAL_INPUT_SEQUENCE :
-			return pTDStretch->getInputSampleReq();
+        case SETTING_NOMINAL_INPUT_SEQUENCE :
+        {
+            int size = pTDStretch->getInputSampleReq();
+
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+            if (rate <= 1.0)
+            {
+                // transposing done before timestretch, which impacts latency
+                return (int)(size * rate + 0.5);
+            }
+#endif
+            return size;
+        }
+
+        case SETTING_NOMINAL_OUTPUT_SEQUENCE :
+        {
+            int size = pTDStretch->getOutputBatchSize();
+
+            if (rate > 1.0)
+            {
+                // transposing done after timestretch, which impacts latency
+                return (int)(size / rate + 0.5);
+            }
+            return size;
+        }
+
+        case SETTING_INITIAL_LATENCY:
+        {
+            double latency = pTDStretch->getLatency();
+            int latency_tr = pRateTransposer->getLatency();
+
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+            if (rate <= 1.0)
+            {
+                // transposing done before timestretch, which impacts latency
+                latency = (latency + latency_tr) * rate;
+            }
+            else
+#endif
+            {
+                latency += (double)latency_tr / rate;
+            }
 
-		case SETTING_NOMINAL_OUTPUT_SEQUENCE :
-			return pTDStretch->getOutputBatchSize();
+            return (int)(latency + 0.5);
+        }
 
-		default :
+        default :
             return 0;
     }
 }
@@ -482,12 +480,13 @@ int SoundTouch::getSetting(int settingId) const
 // buffers.
 void SoundTouch::clear()
 {
+    samplesExpectedOut = 0;
+    samplesOutput = 0;
     pRateTransposer->clear();
     pTDStretch->clear();
 }
 
 
-
 /// Returns number of samples currently unprocessed.
 uint SoundTouch::numUnprocessedSamples() const
 {
@@ -502,3 +501,38 @@ uint SoundTouch::numUnprocessedSamples() const
     }
     return 0;
 }
+
+
+/// Output samples from beginning of the sample buffer. Copies requested samples to 
+/// output buffer and removes them from the sample buffer. If there are less than 
+/// 'numsample' samples in the buffer, returns all that available.
+///
+/// \return Number of samples returned.
+uint SoundTouch::receiveSamples(SAMPLETYPE *output, uint maxSamples)
+{
+    uint ret = FIFOProcessor::receiveSamples(output, maxSamples);
+    samplesOutput += (long)ret;
+    return ret;
+}
+
+
+/// Adjusts book-keeping so that given number of samples are removed from beginning of the 
+/// sample buffer without copying them anywhere. 
+///
+/// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
+/// with 'ptrBegin' function.
+uint SoundTouch::receiveSamples(uint maxSamples)
+{
+    uint ret = FIFOProcessor::receiveSamples(maxSamples);
+    samplesOutput += (long)ret;
+    return ret;
+}
+
+
+/// Get ratio between input and output audio durations, useful for calculating
+/// processed output duration: if you'll process a stream of N samples, then 
+/// you can expect to get out N * getInputOutputSampleRatio() samples.
+double SoundTouch::getInputOutputSampleRatio()
+{
+    return 1.0 / (tempo * rate);
+}
diff --git a/media/libsoundtouch/src/SoundTouch.h b/media/libsoundtouch/src/SoundTouch.h
index 0581926b49..c8a48940ea 100644
--- a/media/libsoundtouch/src/SoundTouch.h
+++ b/media/libsoundtouch/src/SoundTouch.h
@@ -41,13 +41,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-05-18 15:28:41 +0000 (Mon, 18 May 2015) $
-// File revision : $Revision: 4 $
-//
-// $Id: SoundTouch.h 216 2015-05-18 15:28:41Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -79,10 +72,10 @@ namespace soundtouch
 {
 
 /// Soundtouch library version string
-#define SOUNDTOUCH_VERSION          "1.9.0"
+#define SOUNDTOUCH_VERSION          "2.3.1"
 
 /// SoundTouch library version id
-#define SOUNDTOUCH_VERSION_ID       (10900)
+#define SOUNDTOUCH_VERSION_ID       (20301)
 
 //
 // Available setting IDs for the 'setSetting' & 'get_setting' functions:
@@ -116,32 +109,63 @@ namespace soundtouch
 #define SETTING_OVERLAP_MS          5
 
 
-/// Call "getSetting" with this ID to query nominal average processing sequence
-/// size in samples. This value tells approcimate value how many input samples 
-/// SoundTouch needs to gather before it does DSP processing run for the sample batch.
+/// Call "getSetting" with this ID to query processing sequence size in samples. 
+/// This value gives approximate value of how many input samples you'll need to 
+/// feed into SoundTouch after initial buffering to get out a new batch of
+/// output samples. 
+///
+/// This value does not include initial buffering at beginning of a new processing 
+/// stream, use SETTING_INITIAL_LATENCY to get the initial buffering size.
 ///
 /// Notices: 
 /// - This is read-only parameter, i.e. setSetting ignores this parameter
-/// - Returned value is approximate average value, exact processing batch
-///   size may wary from time to time
-/// - This parameter value is not constant but may change depending on 
+/// - This parameter value is not constant but change depending on 
 ///   tempo/pitch/rate/samplerate settings.
-#define SETTING_NOMINAL_INPUT_SEQUENCE		6
+#define SETTING_NOMINAL_INPUT_SEQUENCE      6
 
 
 /// Call "getSetting" with this ID to query nominal average processing output 
 /// size in samples. This value tells approcimate value how many output samples 
 /// SoundTouch outputs once it does DSP processing run for a batch of input samples.
-///	
+///
 /// Notices: 
 /// - This is read-only parameter, i.e. setSetting ignores this parameter
-/// - Returned value is approximate average value, exact processing batch
-///   size may wary from time to time
-/// - This parameter value is not constant but may change depending on 
+/// - This parameter value is not constant but change depending on 
 ///   tempo/pitch/rate/samplerate settings.
-#define SETTING_NOMINAL_OUTPUT_SEQUENCE		7
+#define SETTING_NOMINAL_OUTPUT_SEQUENCE     7
+
 
-class EXPORT SoundTouch : public FIFOProcessor
+/// Call "getSetting" with this ID to query initial processing latency, i.e.
+/// approx. how many samples you'll need to enter to SoundTouch pipeline before 
+/// you can expect to get first batch of ready output samples out. 
+///
+/// After the first output batch, you can then expect to get approx. 
+/// SETTING_NOMINAL_OUTPUT_SEQUENCE ready samples out for every
+/// SETTING_NOMINAL_INPUT_SEQUENCE samples that you enter into SoundTouch.
+///
+/// Example:
+///     processing with parameter -tempo=5
+///     => initial latency = 5509 samples
+///        input sequence  = 4167 samples
+///        output sequence = 3969 samples
+///
+/// Accordingly, you can expect to feed in approx. 5509 samples at beginning of 
+/// the stream, and then you'll get out the first 3969 samples. After that, for 
+/// every approx. 4167 samples that you'll put in, you'll receive again approx. 
+/// 3969 samples out.
+///
+/// This also means that average latency during stream processing is 
+/// INITIAL_LATENCY-OUTPUT_SEQUENCE/2, in the above example case 5509-3969/2 
+/// = 3524 samples
+/// 
+/// Notices: 
+/// - This is read-only parameter, i.e. setSetting ignores this parameter
+/// - This parameter value is not constant but change depending on 
+///   tempo/pitch/rate/samplerate settings.
+#define SETTING_INITIAL_LATENCY             8
+
+
+class SOUNDTOUCH_API SoundTouch : public FIFOProcessor
 {
 private:
     /// Rate transposer class instance
@@ -151,17 +175,24 @@ private:
     class TDStretch *pTDStretch;
 
     /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
-    float virtualRate;
+    double virtualRate;
 
     /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
-    float virtualTempo;
+    double virtualTempo;
 
     /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
-    float virtualPitch;
+    double virtualPitch;
 
     /// Flag: Has sample rate been set?
     bool  bSrateSet;
 
+    /// Accumulator for how many samples in total will be expected as output vs. samples put in,
+    /// considering current processing settings.
+    double samplesExpectedOut;
+
+    /// Accumulator for how many samples in total have been read out from the processing so far
+    long   samplesOutput;
+
     /// Calculates effective rate & tempo valuescfrom 'virtualRate', 'virtualTempo' and 
     /// 'virtualPitch' parameters.
     void calcEffectiveRateAndTempo();
@@ -171,10 +202,10 @@ protected :
     uint  channels;
 
     /// Effective 'rate' value calculated from 'virtualRate', 'virtualTempo' and 'virtualPitch'
-    float rate;
+    double rate;
 
     /// Effective 'tempo' value calculated from 'virtualRate', 'virtualTempo' and 'virtualPitch'
-    float tempo;
+    double tempo;
 
 public:
     SoundTouch();
@@ -188,32 +219,32 @@ public:
 
     /// Sets new rate control value. Normal rate = 1.0, smaller values
     /// represent slower rate, larger faster rates.
-    void setRate(float newRate);
+    void setRate(double newRate);
 
     /// Sets new tempo control value. Normal tempo = 1.0, smaller values
     /// represent slower tempo, larger faster tempo.
-    void setTempo(float newTempo);
+    void setTempo(double newTempo);
 
     /// Sets new rate control value as a difference in percents compared
     /// to the original rate (-50 .. +100 %)
-    void setRateChange(float newRate);
+    void setRateChange(double newRate);
 
     /// Sets new tempo control value as a difference in percents compared
     /// to the original tempo (-50 .. +100 %)
-    void setTempoChange(float newTempo);
+    void setTempoChange(double newTempo);
 
     /// Sets new pitch control value. Original pitch = 1.0, smaller values
     /// represent lower pitches, larger values higher pitch.
-    void setPitch(float newPitch);
+    void setPitch(double newPitch);
 
     /// Sets pitch change in octaves compared to the original pitch  
     /// (-1.00 .. +1.00)
-    void setPitchOctaves(float newPitch);
+    void setPitchOctaves(double newPitch);
 
     /// Sets pitch change in semi-tones compared to the original pitch
     /// (-12 .. +12)
     void setPitchSemiTones(int newPitch);
-    void setPitchSemiTones(float newPitch);
+    void setPitchSemiTones(double newPitch);
 
     /// Sets the number of channels, 1 = mono, 2 = stereo
     void setChannels(uint numChannels);
@@ -221,6 +252,24 @@ public:
     /// Sets sample rate.
     void setSampleRate(uint srate);
 
+    /// Get ratio between input and output audio durations, useful for calculating
+    /// processed output duration: if you'll process a stream of N samples, then 
+    /// you can expect to get out N * getInputOutputSampleRatio() samples.
+    ///
+    /// This ratio will give accurate target duration ratio for a full audio track, 
+    /// given that the the whole track is processed with same processing parameters.
+    /// 
+    /// If this ratio is applied to calculate intermediate offsets inside a processing
+    /// stream, then this ratio is approximate and can deviate +- some tens of milliseconds 
+    /// from ideal offset, yet by end of the audio stream the duration ratio will become
+    /// exact.
+    ///
+    /// Example: if processing with parameters "-tempo=15 -pitch=-3", the function
+    /// will return value 0.8695652... Now, if processing an audio stream whose duration
+    /// is exactly one million audio samples, then you can expect the processed 
+    /// output duration  be 0.869565 * 1000000 = 869565 samples.
+    double getInputOutputSampleRatio();
+
     /// Flushes the last samples from the processing pipeline to the output.
     /// Clears also the internal processing buffers.
     //
@@ -240,6 +289,23 @@ public:
                                                     ///< contains data for both channels.
             );
 
+    /// Output samples from beginning of the sample buffer. Copies requested samples to 
+    /// output buffer and removes them from the sample buffer. If there are less than 
+    /// 'numsample' samples in the buffer, returns all that available.
+    ///
+    /// \return Number of samples returned.
+    virtual uint receiveSamples(SAMPLETYPE *output, ///< Buffer where to copy output samples.
+        uint maxSamples                 ///< How many samples to receive at max.
+        );
+
+    /// Adjusts book-keeping so that given number of samples are removed from beginning of the 
+    /// sample buffer without copying them anywhere. 
+    ///
+    /// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
+    /// with 'ptrBegin' function.
+    virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
+        );
+
     /// Clears all the samples in the object's output and internal processing
     /// buffers.
     virtual void clear();
@@ -247,7 +313,7 @@ public:
     /// Changes a setting controlling the processing system behaviour. See the
     /// 'SETTING_...' defines for available setting ID's.
     /// 
-    /// \return 'true' if the setting was succesfully changed
+    /// \return 'true' if the setting was successfully changed
     bool setSetting(int settingId,   ///< Setting ID number. see SETTING_... defines.
                     int value        ///< New setting value.
                     );
@@ -262,6 +328,11 @@ public:
     /// Returns number of samples currently unprocessed.
     virtual uint numUnprocessedSamples() const;
 
+    /// Return number of channels
+    uint numChannels() const
+    {
+        return channels;
+    }
 
     /// Other handy functions that are implemented in the ancestor classes (see
     /// classes 'FIFOProcessor' and 'FIFOSamplePipe')
diff --git a/media/libsoundtouch/src/SoundTouchFactory.cpp b/media/libsoundtouch/src/SoundTouchFactory.cpp
index b577616fca..d33fc133d7 100644
--- a/media/libsoundtouch/src/SoundTouchFactory.cpp
+++ b/media/libsoundtouch/src/SoundTouchFactory.cpp
@@ -9,14 +9,14 @@
 namespace soundtouch
 {
 
-EXPORT
+SOUNDTOUCH_API
 soundtouch::SoundTouch*
 createSoundTouchObj()
 {
   return new soundtouch::SoundTouch();
 }
 
-EXPORT
+SOUNDTOUCH_API
 void
 destroySoundTouchObj(soundtouch::SoundTouch* aObj)
 {
diff --git a/media/libsoundtouch/src/SoundTouchFactory.h b/media/libsoundtouch/src/SoundTouchFactory.h
index cea4d9a649..98d8a7deb1 100644
--- a/media/libsoundtouch/src/SoundTouchFactory.h
+++ b/media/libsoundtouch/src/SoundTouchFactory.h
@@ -12,11 +12,11 @@
 
 namespace soundtouch
 {
-EXPORT
+SOUNDTOUCH_API
 soundtouch::SoundTouch*
 createSoundTouchObj();
 
-EXPORT
+SOUNDTOUCH_API
 void
 destroySoundTouchObj(soundtouch::SoundTouch* aObj);
 }
diff --git a/media/libsoundtouch/src/TDStretch.cpp b/media/libsoundtouch/src/TDStretch.cpp
index b955bfc961..709e979d1d 100644
--- a/media/libsoundtouch/src/TDStretch.cpp
+++ b/media/libsoundtouch/src/TDStretch.cpp
@@ -1,11 +1,17 @@
-////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
 /// 
 /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
 /// while maintaining the original pitch by using a time domain WSOLA-like 
 /// method with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific 
-/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Notes : MMX optimized functions reside in a separate, platform-specific 
+/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
+///
+/// This source file contains OpenMP optimizations that allow speeding up the
+/// corss-correlation algorithm by executing it in several threads / CPU cores 
+/// in parallel. See the following article link for more detailed discussion 
+/// about SoundTouch OpenMP optimizations:
+/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -13,13 +19,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-22 15:07:12 +0000 (Sun, 22 Feb 2015) $
-// File revision : $Revision: 1.12 $
-//
-// $Id: TDStretch.cpp 205 2015-02-22 15:07:12Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -55,7 +54,6 @@ using namespace soundtouch;
 
 #define max(x, y) (((x) > (y)) ? (x) : (y))
 
-
 /*****************************************************************************
  *
  * Constant definitions
@@ -63,7 +61,7 @@ using namespace soundtouch;
  *****************************************************************************/
 
 // Table for the hierarchical mixing position seeking algorithm
-static const short _scanOffsets[5][24]={
+const short _scanOffsets[5][24]={
     { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806,
       868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0},
     {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
@@ -94,9 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
     bAutoSeqSetting = true;
     bAutoSeekSetting = true;
 
-//    outDebt = 0;
-    skipFract = 0;
-
     tempo = 1.0f;
     setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     setTempo(1.0f);
@@ -202,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
     m1 = (SAMPLETYPE)0;
     m2 = (SAMPLETYPE)overlapLength;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
         m1 += 1;
@@ -222,6 +217,10 @@ void TDStretch::clearInput()
 {
     inputBuffer.clear();
     clearMidBuffer();
+    isBeginning = true;
+    maxnorm = 0;
+    maxnormf = 1e8;
+    skipFract = 0;
 }
 
 
@@ -255,7 +254,7 @@ int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
     if (bQuickSeek) 
     {
         return seekBestOverlapPositionQuick(refPos);
-    } 
+    }
     else 
     {
         return seekBestOverlapPositionFull(refPos);
@@ -287,7 +286,6 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 }
 
 
-
 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
 // routine
 //
@@ -301,21 +299,23 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
     int i;
     double norm;
 
-    bestCorr = FLT_MIN;
+    bestCorr = -FLT_MAX;
     bestOffs = 0;
 
     // Scans for the best correlation value by testing each possible position
     // over the permitted range.
     bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
+    bestCorr = (bestCorr + 0.1) * 0.75;
 
     #pragma omp parallel for
-    for (i = 1; i < seekLength; i ++) 
+    for (i = 1; i < seekLength; i ++)
     {
         double corr;
         // Calculates correlation value for the mixing position corresponding to 'i'
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
         // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
         // iterate the loop in sequential order
+        // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
         corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
 #else
         // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@@ -341,6 +341,11 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
             }
         }
     }
+
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
@@ -348,64 +353,157 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
 }
 
 
-// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
-// routine
+// Quick seek algorithm for improved runtime-performance: First roughly scans through the 
+// correlation area, and then scan surroundings of two best preliminary correlation candidates
+// with improved precision
 //
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) 
+// Based on testing:
+// - This algorithm gives on average 99% as good match as the full algorithm
+// - this quick seek algorithm finds the best match on ~90% of cases
+// - on those 10% of cases when this algorithm doesn't find best match, 
+//   it still finds on average ~90% match vs. the best possible match
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
 {
-    int j;
+#define _MIN(a, b)   (((a) < (b)) ? (a) : (b))
+#define SCANSTEP    16
+#define SCANWIND    8
+
     int bestOffs;
-    double bestCorr, corr;
-    int scanCount, corrOffset, tempOffset;
+    int i;
+    int bestOffs2;
+    float bestCorr, corr;
+    float bestCorr2;
+    double norm;
+
+    // note: 'float' types used in this function in case that the platform would need to use software-fp
 
-    bestCorr = FLT_MIN;
-    bestOffs = _scanOffsets[0][0];
-    corrOffset = 0;
-    tempOffset = 0;
+    bestCorr =
+    bestCorr2 = -FLT_MAX;
+    bestOffs = 
+    bestOffs2 = SCANWIND;
 
-    // Scans for the best correlation value using four-pass hierarchical search.
+    // Scans for the best correlation value by testing each possible position
+    // over the permitted range. Look for two best matches on the first pass to
+    // increase possibility of ideal match.
     //
-    // The look-up table 'scans' has hierarchical position adjusting steps.
-    // In first pass the routine searhes for the highest correlation with 
-    // relatively coarse steps, then rescans the neighbourhood of the highest
-    // correlation with better resolution and so on.
-    for (scanCount = 0;scanCount < 4; scanCount ++) 
-    {
-        j = 0;
-        while (_scanOffsets[scanCount][j]) 
+    // Begin from "SCANSTEP" instead of SCANWIND to make the calculation
+    // catch the 'middlepoint' of seekLength vector as that's the a-priori 
+    // expected best match position
+    //
+    // Roughly:
+    // - 15% of cases find best result directly on the first round,
+    // - 75% cases find better match on 2nd round around the best match from 1st round
+    // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
+    for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
+    {
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the seek range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
         {
-            double norm;
-            tempOffset = corrOffset + _scanOffsets[scanCount][j];
-            if (tempOffset >= seekLength) break;
-
-            // Calculates correlation value for the mixing position corresponding
-            // to 'tempOffset'
-            corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm);
-            // heuristic rule to slightly favour values close to mid of the range
-            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
-            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
-
-            // Checks for the highest correlation value
-            if (corr > bestCorr) 
-            {
-                bestCorr = corr;
-                bestOffs = tempOffset;
-            }
-            j ++;
+            // found new best match. keep the previous best as 2nd best match
+            bestCorr2 = bestCorr;
+            bestOffs2 = bestOffs;
+            bestCorr = corr;
+            bestOffs = i;
+        }
+        else if (corr > bestCorr2)
+        {
+            // not new best, but still new 2nd best match
+            bestCorr2 = corr;
+            bestOffs2 = i;
+        }
+    }
+
+    // Scans surroundings of the found best match with small stepping
+    int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
+    for (i = bestOffs - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
+        }
+    }
+
+    // Scans surroundings of the 2nd best match with small stepping
+    end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
+    for (i = bestOffs2 - SCANWIND; i < end; i++)
+    {
+        if (i == bestOffs2) continue;    // this offset already calculated, thus skip
+
+        // Calculates correlation value for the mixing position corresponding
+        // to 'i'
+        corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
+        // heuristic rule to slightly favour values close to mid of the range
+        float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
+        corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr)
+        {
+            bestCorr = corr;
+            bestOffs = i;
         }
-        corrOffset = bestOffs;
     }
+
     // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     clearCrossCorrState();
 
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
+    adaptNormalizer();
+#endif
+
     return bestOffs;
 }
 
 
 
+
+/// For integer algorithm: adapt normalization factor divider with music so that 
+/// it'll not be pessimistically restrictive that can degrade quality on quieter sections
+/// yet won't cause integer overflows either
+void TDStretch::adaptNormalizer()
+{
+    // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
+    // too low values during pauses in music
+    if ((maxnorm > 1000) || (maxnormf > 40000000))
+    { 
+        //norm averaging filter
+        maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
+
+        if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
+        {
+            // large values, so increase divider
+            overlapDividerBitsNorm++;
+            if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
+        }
+        else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
+        {
+            // extra small values, decrease divider
+            overlapDividerBitsNorm--;
+        }
+    }
+
+    maxnorm = 0;
+}
+
+
 /// clear cross correlation routine state if necessary 
 void TDStretch::clearCrossCorrState()
 {
@@ -417,18 +515,18 @@ void TDStretch::clearCrossCorrState()
 void TDStretch::calcSeqParameters()
 {
     // Adjust tempo param according to tempo, so that variating processing sequence length is used
-    // at varius tempo settings, between the given low...top limits
+    // at various tempo settings, between the given low...top limits
     #define AUTOSEQ_TEMPO_LOW   0.5     // auto setting low tempo range (-50%)
     #define AUTOSEQ_TEMPO_TOP   2.0     // auto setting top tempo range (+100%)
 
     // sequence-ms setting values at above low & top tempo
-    #define AUTOSEQ_AT_MIN      125.0
-    #define AUTOSEQ_AT_MAX      50.0
+    #define AUTOSEQ_AT_MIN      90.0
+    #define AUTOSEQ_AT_MAX      40.0
     #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
 
-    // seek-window-ms setting values at above low & top tempo
-    #define AUTOSEEK_AT_MIN     25.0
+    // seek-window-ms setting values at above low & top tempoq
+    #define AUTOSEEK_AT_MIN     20.0
     #define AUTOSEEK_AT_MAX     15.0
     #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
     #define AUTOSEEK_C          (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
@@ -464,7 +562,7 @@ void TDStretch::calcSeqParameters()
 
 // Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower 
 // tempo, larger faster tempo.
-void TDStretch::setTempo(float newTempo)
+void TDStretch::setTempo(double newTempo)
 {
     int intskip;
 
@@ -475,7 +573,7 @@ void TDStretch::setTempo(float newTempo)
 
     // Calculate ideal skip length (according to tempo value) 
     nominalSkip = tempo * (seekWindowLength - overlapLength);
-    intskip = (int)(nominalSkip + 0.5f);
+    intskip = (int)(nominalSkip + 0.5);
 
     // Calculate how many samples are needed in the 'inputBuffer' to 
     // process another batch of samples
@@ -488,9 +586,8 @@ void TDStretch::setTempo(float newTempo)
 // Sets the number of channels, 1 = mono, 2 = stereo
 void TDStretch::setChannels(int numChannels)
 {
-    assert(numChannels > 0);
-    if (channels == numChannels) return;
-//    assert(numChannels == 1 || numChannels == 2);
+    if (!verifyNumberOfChannels(numChannels) ||
+        (channels == numChannels)) return;
 
     channels = numChannels;
     inputBuffer.setChannels(channels);
@@ -539,7 +636,8 @@ void TDStretch::processNominalTempo()
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
 {
-    int ovlSkip, offset;
+    int ovlSkip;
+    int offset = 0;
     int temp;
 
     /* Removed this small optimization - can introduce a click to sound when tempo setting
@@ -556,35 +654,62 @@ void TDStretch::processSamples()
     // to form a processing frame.
     while ((int)inputBuffer.numSamples() >= sampleReq) 
     {
-        // If tempo differs from the normal ('SCALE'), scan for the best overlapping
-        // position
-        offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
-
-        // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
-        // samples in 'midBuffer' using sliding overlapping
-        // ... first partially overlap with the end of the previous sequence
-        // (that's in 'midBuffer')
-        overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
-        outputBuffer.putSamples((uint)overlapLength);
+        if (isBeginning == false)
+        {
+            // apart from the very beginning of the track, 
+            // scan for the best overlapping position & do overlap-add
+            offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
+
+            // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
+            // samples in 'midBuffer' using sliding overlapping
+            // ... first partially overlap with the end of the previous sequence
+            // (that's in 'midBuffer')
+            overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
+            outputBuffer.putSamples((uint)overlapLength);
+            offset += overlapLength;
+        }
+        else
+        {
+            // Adjust processing offset at beginning of track by not perform initial overlapping
+            // and compensating that in the 'input buffer skip' calculation
+            isBeginning = false;
+            int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
+
+            #ifdef ST_SIMD_AVOID_UNALIGNED
+            // in SIMD mode, round the skip amount to value corresponding to aligned memory address
+            if (channels == 1)
+            {
+                skip &= -4;
+            }
+            else if (channels == 2)
+            {
+                skip &= -2;
+            }
+            #endif
+            skipFract -= skip;
+            if (skipFract <= -nominalSkip)
+            {
+                skipFract = -nominalSkip;
+            }
+        }
 
         // ... then copy sequence samples from 'inputBuffer' to output:
 
-        // length of sequence
-        temp = (seekWindowLength - 2 * overlapLength);
-
         // crosscheck that we don't have buffer overflow...
-        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
+        if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
         {
             continue;    // just in case, shouldn't really happen
         }
 
-        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+        // length of sequence
+        temp = (seekWindowLength - 2 * overlapLength);
+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
 
         // Copies the end of the current sequence from 'inputBuffer' to 
         // 'midBuffer' for being mixed with the beginning of the next 
         // processing sequence and so on
-        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
+        assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), 
             channels * sizeof(SAMPLETYPE) * overlapLength);
 
         // Remove the processed samples from the input buffer. Update
@@ -680,7 +805,7 @@ TDStretch * TDStretch::newInstance()
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// Integer arithmetics specific algorithm implementations.
+// Integer arithmetic specific algorithm implementations.
 //
 //////////////////////////////////////////////////////////////////////////////
 
@@ -694,7 +819,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
     short temp;
     int cnt2;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         temp = (short)(overlapLength - i);
         cnt2 = 2 * i;
@@ -706,21 +831,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
 
 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
 // version of the routine.
-void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
+void TDStretch::overlapMulti(short *poutput, const short *input) const
 {
-    SAMPLETYPE m1=(SAMPLETYPE)0;
-    SAMPLETYPE m2;
-    int i=0;
+    short m1;
+    int i = 0;
 
-    for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
+    for (m1 = 0; m1 < overlapLength; m1 ++)
     {
+        short m2 = (short)(overlapLength - m1);
         for (int c = 0; c < channels; c ++)
         {
             poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2)  / overlapLength;
             i++;
         }
-
-        m1++;
     }
 }
 
@@ -743,13 +866,15 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
     // calculate overlap length so that it's power of 2 - thus it's easy to do
     // integer division by right-shifting. Term "-1" at end is to account for 
     // the extra most significatnt bit left unused in result by signed multiplication 
-    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
-    if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 3) overlapDividerBits = 3;
-    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above
+    overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
+    if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
+    if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1);    // +1 => account for -1 above
 
     acceptNewOverlapLength(newOvl);
 
+    overlapDividerBitsNorm = overlapDividerBitsPure;
+
     // calculate sloping divider so that crosscorrelation operation won't 
     // overflow 32-bit register. Max. sum of the crosscorrelation sum without 
     // divider would be 2^30*(N^3-N)/3, where N = overlap length
@@ -757,28 +882,40 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 }
 
 
-double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
-    long lnorm;
+    unsigned long lnorm;
     int i;
 
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     corr = lnorm = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i += 2)
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
         lnorm += (mixingPos[i] * mixingPos[i] + 
-                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow
-        lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + 
-                  mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
+                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
+        // do intermediate scalings to avoid integer overflow
     }
 
+    if (lnorm > maxnorm)
+    {
+        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+        #pragma omp critical
+        if (lnorm > maxnorm)
+        {
+            maxnorm = lnorm;
+        }
+    }
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     norm = (double)lnorm;
@@ -787,38 +924,42 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
 {
     long corr;
     long lnorm;
     int i;
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     // cancel first normalizer tap from previous round
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
     {
-        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits;
+        lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
     }
 
     corr = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono.
+    for (i = 0; i < ilength; i += 2) 
     {
         corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
     }
 
     // update normalizer with last samples of this round
     for (int j = 0; j < channels; j ++)
     {
         i --;
-        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
+        lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
     }
+
     norm += (double)lnorm;
+    if (norm > maxnorm)
+    {
+        maxnorm = (unsigned long)norm;
+    }
 
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
@@ -829,7 +970,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// Floating point arithmetics specific algorithm implementations.
+// Floating point arithmetic specific algorithm implementations.
 //
 
 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
@@ -903,29 +1044,26 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 
 
 /// Calculate cross-correlation
-double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
 {
-    double corr;
-    double norm;
+    float corr;
+    float norm;
     int i;
 
-    corr = norm = 0;
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
-    {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1];
-
-        norm += mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1];
+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
 
-        // unroll the loop for better CPU efficiency:
-        corr += mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
 
-        norm += mixingPos[i + 2] * mixingPos[i + 2] +
-                mixingPos[i + 3] * mixingPos[i + 3];
+    corr = norm = 0;
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
+    {
+        corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
     }
 
     anorm = norm;
@@ -934,9 +1072,9 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const
+double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
 {
-    double corr;
+    float corr;
     int i;
 
     corr = 0;
@@ -947,14 +1085,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
         norm -= mixingPos[-i] * mixingPos[-i];
     }
 
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
     {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1] +
-                mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+        corr += mixingPos[i] * compare[i];
     }
 
     // update normalizer with last samples of this round
diff --git a/media/libsoundtouch/src/TDStretch.h b/media/libsoundtouch/src/TDStretch.h
index b390736913..3ef79c7cdf 100644
--- a/media/libsoundtouch/src/TDStretch.h
+++ b/media/libsoundtouch/src/TDStretch.h
@@ -13,13 +13,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-04-06 15:57:21 +0000 (Sun, 06 Apr 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: TDStretch.h 195 2014-04-06 15:57:21Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -112,39 +105,47 @@ class TDStretch : public FIFOProcessor
 protected:
     int channels;
     int sampleReq;
-    float tempo;
 
-    SAMPLETYPE *pMidBuffer;
-    SAMPLETYPE *pMidBufferUnaligned;
     int overlapLength;
     int seekLength;
     int seekWindowLength;
-    int overlapDividerBits;
+    int overlapDividerBitsNorm;
+    int overlapDividerBitsPure;
     int slopingDivider;
-    float nominalSkip;
-    float skipFract;
-    FIFOSampleBuffer outputBuffer;
-    FIFOSampleBuffer inputBuffer;
-    bool bQuickSeek;
-
     int sampleRate;
     int sequenceMs;
     int seekWindowMs;
     int overlapMs;
+
+    unsigned long maxnorm;
+    float maxnormf;
+
+    double tempo;
+    double nominalSkip;
+    double skipFract;
+
+    bool bQuickSeek;
     bool bAutoSeqSetting;
     bool bAutoSeekSetting;
+    bool isBeginning;
+
+    SAMPLETYPE *pMidBuffer;
+    SAMPLETYPE *pMidBufferUnaligned;
+
+    FIFOSampleBuffer outputBuffer;
+    FIFOSampleBuffer inputBuffer;
 
     void acceptNewOverlapLength(int newOverlapLength);
 
     virtual void clearCrossCorrState();
     void calculateOverlapLength(int overlapMs);
 
-    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const;
-    virtual double calcCrossCorrAccumulate(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm) const;
+    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm);
+    virtual double calcCrossCorrAccumulate(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare, double &norm);
 
     virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos);
     virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos);
-    int seekBestOverlapPosition(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPosition(const SAMPLETYPE *refPos);
 
     virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
     virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const;
@@ -154,6 +155,7 @@ protected:
     void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
 
     void calcSeqParameters();
+    void adaptNormalizer();
 
     /// Changes the tempo of the given sound samples.
     /// Returns amount of samples returned in the "output" buffer.
@@ -182,7 +184,7 @@ public:
 
     /// Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower 
     /// tempo, larger faster tempo.
-    void setTempo(float newTempo);
+    void setTempo(double newTempo);
 
     /// Returns nonzero if there aren't any samples available for outputting.
     virtual void clear();
@@ -238,8 +240,13 @@ public:
     {
         return seekWindowLength - overlapLength;
     }
-};
 
+	/// return approximate initial input-output latency
+	int getLatency() const
+	{
+		return sampleReq;
+	}
+};
 
 
 // Implementation-specific class declarations:
@@ -249,8 +256,8 @@ public:
     class TDStretchMMX : public TDStretch
     {
     protected:
-        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const;
-        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const;
+        double calcCrossCorr(const short *mixingPos, const short *compare, double &norm);
+        double calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm);
         virtual void overlapStereo(short *output, const short *input) const;
         virtual void clearCrossCorrState();
     };
@@ -262,8 +269,8 @@ public:
     class TDStretchSSE : public TDStretch
     {
     protected:
-        double calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const;
-        double calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const;
+        double calcCrossCorr(const float *mixingPos, const float *compare, double &norm);
+        double calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm);
     };
 
 #endif /// SOUNDTOUCH_ALLOW_SSE
diff --git a/media/libsoundtouch/src/cpu_detect.h b/media/libsoundtouch/src/cpu_detect.h
index e7582e8da3..093b6097aa 100644
--- a/media/libsoundtouch/src/cpu_detect.h
+++ b/media/libsoundtouch/src/cpu_detect.h
@@ -12,13 +12,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2008-02-10 16:26:55 +0000 (Sun, 10 Feb 2008) $
-// File revision : $Revision: 4 $
-//
-// $Id: cpu_detect.h 11 2008-02-10 16:26:55Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
diff --git a/media/libsoundtouch/src/cpu_detect_x86.cpp b/media/libsoundtouch/src/cpu_detect_x86.cpp
index 63f860664a..f0f026b20d 100644
--- a/media/libsoundtouch/src/cpu_detect_x86.cpp
+++ b/media/libsoundtouch/src/cpu_detect_x86.cpp
@@ -11,13 +11,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2014-01-07 18:24:28 +0000 (Tue, 07 Jan 2014) $
-// File revision : $Revision: 4 $
-//
-// $Id: cpu_detect_x86.cpp 183 2014-01-07 18:24:28Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -74,7 +67,6 @@ void disableExtensions(uint dwDisableMask)
 }
 
 
-
 /// Checks which instruction set extensions are supported by the CPU.
 uint detectCPUextensions(void)
 {
diff --git a/media/libsoundtouch/src/mmx_optimized.cpp b/media/libsoundtouch/src/mmx_optimized.cpp
index 9062026399..0bc7fe86f7 100644
--- a/media/libsoundtouch/src/mmx_optimized.cpp
+++ b/media/libsoundtouch/src/mmx_optimized.cpp
@@ -20,13 +20,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-22 15:10:38 +0000 (Sun, 22 Feb 2015) $
-// File revision : $Revision: 4 $
-//
-// $Id: mmx_optimized.cpp 206 2015-02-22 15:10:38Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -68,7 +61,7 @@ using namespace soundtouch;
 
 
 // Calculates cross correlation of two buffers
-double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) const
+double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
@@ -79,7 +72,7 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
-    shifter = _m_from_int(overlapDividerBits);
+    shifter = _m_from_int(overlapDividerBitsNorm);
     normaccu = accu = _mm_setzero_si64();
 
     // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
@@ -123,6 +116,16 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
     // Clear MMS state
     _m_empty();
 
+    if (norm > (long)maxnorm)
+    {
+        // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
+        #pragma omp critical
+        if (norm > (long)maxnorm)
+        {
+            maxnorm = norm;
+        }
+    }
+
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     dnorm = (double)norm;
@@ -134,7 +137,7 @@ double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &d
 
 
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
-double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) const
+double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
@@ -146,13 +149,13 @@ double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2,
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
     {
-        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBits;
+        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBitsNorm;
     }
 
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
-    shifter = _m_from_int(overlapDividerBits);
+    shifter = _m_from_int(overlapDividerBitsNorm);
     accu = _mm_setzero_si64();
 
     // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
@@ -191,10 +194,15 @@ double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2,
     pV1 = (short *)pVec1;
     for (int j = 1; j <= channels; j ++)
     {
-        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBits;
+        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBitsNorm;
     }
     dnorm += (double)lnorm;
 
+    if (lnorm > (long)maxnorm)
+    {
+        maxnorm = lnorm;
+    }
+
     // Normalize result by dividing by sqrt(norm) - this step is easiest 
     // done using floating point operation
     return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm);
@@ -209,7 +217,6 @@ void TDStretchMMX::clearCrossCorrState()
 }
 
 
-
 // MMX-optimized version of the function overlapStereo
 void TDStretchMMX::overlapStereo(short *output, const short *input) const
 {
@@ -233,7 +240,7 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
 
     // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
     // overlapDividerBits calculation earlier.
-    shifter = _m_from_int(overlapDividerBits + 1);
+    shifter = _m_from_int(overlapDividerBitsPure + 1);
 
     for (i = 0; i < overlapLength / 4; i ++)
     {
@@ -325,7 +332,6 @@ void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uRe
 }
 
 
-
 // mmx-optimized version of the filter routine for stereo sound
 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
 {
@@ -382,4 +388,9 @@ uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numS
     return (numSamples & 0xfffffffe) - length;
 }
 
+#else
+
+// workaround to not complain about empty module
+bool _dontcomplain_mmx_empty;
+
 #endif  // SOUNDTOUCH_ALLOW_MMX
diff --git a/media/libsoundtouch/src/moz.build b/media/libsoundtouch/src/moz.build
index a4294864f9..0912889f6f 100644
--- a/media/libsoundtouch/src/moz.build
+++ b/media/libsoundtouch/src/moz.build
@@ -1,4 +1,5 @@
 # -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -41,6 +42,8 @@ else:
     # Windows need alloca renamed to _alloca
     DEFINES['alloca'] = '_alloca'
 
+DEFINES['BUILDING_SOUNDTOUCH'] = 1
+
 # We allow warnings for third-party code that can be updated from upstream.
 ALLOW_COMPILER_WARNINGS = True
 
diff --git a/media/libsoundtouch/src/sse_optimized.cpp b/media/libsoundtouch/src/sse_optimized.cpp
index fa622efa55..9c16ea8f89 100644
--- a/media/libsoundtouch/src/sse_optimized.cpp
+++ b/media/libsoundtouch/src/sse_optimized.cpp
@@ -23,13 +23,6 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2015-02-21 21:24:29 +0000 (Sat, 21 Feb 2015) $
-// File revision : $Revision: 4 $
-//
-// $Id: sse_optimized.cpp 202 2015-02-21 21:24:29Z oparviai $
-//
-////////////////////////////////////////////////////////////////////////////////
-//
 // License :
 //
 //  SoundTouch audio processing library
@@ -71,7 +64,7 @@ using namespace soundtouch;
 #include <math.h>
 
 // Calculates cross correlation of two buffers
-double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm)
 {
     int i;
     const float *pVec1;
@@ -87,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
     // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
     // for choosing if this little cheating is allowed.
 
-#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+#ifdef ST_SIMD_AVOID_UNALIGNED
     // Little cheating allowed, return valid correlation only for 
     // aligned locations, meaning every second round for stereo sound.
 
@@ -183,7 +176,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
 
 
 
-double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const
+double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm)
 {
     // call usual calcCrossCorr function because SSE does not show big benefit of 
     // accumulating "norm" value, and also the "norm" rolling algorithm would get