diff options
Diffstat (limited to 'media/libsoundtouch/src/TDStretch.cpp')
-rw-r--r-- | media/libsoundtouch/src/TDStretch.cpp | 443 |
1 files changed, 290 insertions, 153 deletions
diff --git a/media/libsoundtouch/src/TDStretch.cpp b/media/libsoundtouch/src/TDStretch.cpp index b955bfc961..709e979d1d 100644 --- a/media/libsoundtouch/src/TDStretch.cpp +++ b/media/libsoundtouch/src/TDStretch.cpp @@ -1,11 +1,17 @@ -//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// /// /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo /// while maintaining the original pitch by using a time domain WSOLA-like /// method with several performance-increasing tweaks. /// -/// Note : MMX optimized functions reside in a separate, platform-specific -/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp' +/// Notes : MMX optimized functions reside in a separate, platform-specific +/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'. +/// +/// This source file contains OpenMP optimizations that allow speeding up the +/// corss-correlation algorithm by executing it in several threads / CPU cores +/// in parallel. See the following article link for more detailed discussion +/// about SoundTouch OpenMP optimizations: +/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices /// /// Author : Copyright (c) Olli Parviainen /// Author e-mail : oparviai 'at' iki.fi @@ -13,13 +19,6 @@ /// //////////////////////////////////////////////////////////////////////////////// // -// Last changed : $Date: 2015-02-22 15:07:12 +0000 (Sun, 22 Feb 2015) $ -// File revision : $Revision: 1.12 $ -// -// $Id: TDStretch.cpp 205 2015-02-22 15:07:12Z oparviai $ -// -//////////////////////////////////////////////////////////////////////////////// -// // License : // // SoundTouch audio processing library @@ -55,7 +54,6 @@ using namespace soundtouch; #define max(x, y) (((x) > (y)) ? (x) : (y)) - /***************************************************************************** * * Constant definitions @@ -63,7 +61,7 @@ using namespace soundtouch; *****************************************************************************/ // Table for the hierarchical mixing position seeking algorithm -static const short _scanOffsets[5][24]={ +const short _scanOffsets[5][24]={ { 124, 186, 248, 310, 372, 434, 496, 558, 620, 682, 744, 806, 868, 930, 992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488, 0}, {-100, -75, -50, -25, 25, 50, 75, 100, 0, 0, 0, 0, @@ -94,9 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) bAutoSeqSetting = true; bAutoSeekSetting = true; -// outDebt = 0; - skipFract = 0; - tempo = 1.0f; setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS); setTempo(1.0f); @@ -202,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const m1 = (SAMPLETYPE)0; m2 = (SAMPLETYPE)overlapLength; - for (i = 0; i < overlapLength ; i ++) + for (i = 0; i < overlapLength ; i ++) { pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength; m1 += 1; @@ -222,6 +217,10 @@ void TDStretch::clearInput() { inputBuffer.clear(); clearMidBuffer(); + isBeginning = true; + maxnorm = 0; + maxnormf = 1e8; + skipFract = 0; } @@ -255,7 +254,7 @@ int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos) if (bQuickSeek) { return seekBestOverlapPositionQuick(refPos); - } + } else { return seekBestOverlapPositionFull(refPos); @@ -287,7 +286,6 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui } - // Seeks for the optimal overlap-mixing position. The 'stereo' version of the // routine // @@ -301,21 +299,23 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) int i; double norm; - bestCorr = FLT_MIN; + bestCorr = -FLT_MAX; bestOffs = 0; // Scans for the best correlation value by testing each possible position // over the permitted range. bestCorr = calcCrossCorr(refPos, pMidBuffer, norm); + bestCorr = (bestCorr + 0.1) * 0.75; #pragma omp parallel for - for (i = 1; i < seekLength; i ++) + for (i = 1; i < seekLength; i ++) { double corr; // Calculates correlation value for the mixing position corresponding to 'i' -#ifdef _OPENMP +#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED) // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't // iterate the loop in sequential order + // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm); #else // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same @@ -341,6 +341,11 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) } } } + +#ifdef SOUNDTOUCH_INTEGER_SAMPLES + adaptNormalizer(); +#endif + // clear cross correlation routine state if necessary (is so e.g. in MMX routines). clearCrossCorrState(); @@ -348,64 +353,157 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) } -// Seeks for the optimal overlap-mixing position. The 'stereo' version of the -// routine +// Quick seek algorithm for improved runtime-performance: First roughly scans through the +// correlation area, and then scan surroundings of two best preliminary correlation candidates +// with improved precision // -// The best position is determined as the position where the two overlapped -// sample sequences are 'most alike', in terms of the highest cross-correlation -// value over the overlapping period -int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) +// Based on testing: +// - This algorithm gives on average 99% as good match as the full algorithm +// - this quick seek algorithm finds the best match on ~90% of cases +// - on those 10% of cases when this algorithm doesn't find best match, +// it still finds on average ~90% match vs. the best possible match +int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) { - int j; +#define _MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define SCANSTEP 16 +#define SCANWIND 8 + int bestOffs; - double bestCorr, corr; - int scanCount, corrOffset, tempOffset; + int i; + int bestOffs2; + float bestCorr, corr; + float bestCorr2; + double norm; + + // note: 'float' types used in this function in case that the platform would need to use software-fp - bestCorr = FLT_MIN; - bestOffs = _scanOffsets[0][0]; - corrOffset = 0; - tempOffset = 0; + bestCorr = + bestCorr2 = -FLT_MAX; + bestOffs = + bestOffs2 = SCANWIND; - // Scans for the best correlation value using four-pass hierarchical search. + // Scans for the best correlation value by testing each possible position + // over the permitted range. Look for two best matches on the first pass to + // increase possibility of ideal match. // - // The look-up table 'scans' has hierarchical position adjusting steps. - // In first pass the routine searhes for the highest correlation with - // relatively coarse steps, then rescans the neighbourhood of the highest - // correlation with better resolution and so on. - for (scanCount = 0;scanCount < 4; scanCount ++) - { - j = 0; - while (_scanOffsets[scanCount][j]) + // Begin from "SCANSTEP" instead of SCANWIND to make the calculation + // catch the 'middlepoint' of seekLength vector as that's the a-priori + // expected best match position + // + // Roughly: + // - 15% of cases find best result directly on the first round, + // - 75% cases find better match on 2nd round around the best match from 1st round + // - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round + for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP) + { + // Calculates correlation value for the mixing position corresponding + // to 'i' + corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm); + // heuristic rule to slightly favour values close to mid of the seek range + float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength; + corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp)); + + // Checks for the highest correlation value + if (corr > bestCorr) { - double norm; - tempOffset = corrOffset + _scanOffsets[scanCount][j]; - if (tempOffset >= seekLength) break; - - // Calculates correlation value for the mixing position corresponding - // to 'tempOffset' - corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer, norm); - // heuristic rule to slightly favour values close to mid of the range - double tmp = (double)(2 * tempOffset - seekLength) / seekLength; - corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); - - // Checks for the highest correlation value - if (corr > bestCorr) - { - bestCorr = corr; - bestOffs = tempOffset; - } - j ++; + // found new best match. keep the previous best as 2nd best match + bestCorr2 = bestCorr; + bestOffs2 = bestOffs; + bestCorr = corr; + bestOffs = i; + } + else if (corr > bestCorr2) + { + // not new best, but still new 2nd best match + bestCorr2 = corr; + bestOffs2 = i; + } + } + + // Scans surroundings of the found best match with small stepping + int end = _MIN(bestOffs + SCANWIND + 1, seekLength); + for (i = bestOffs - SCANWIND; i < end; i++) + { + if (i == bestOffs) continue; // this offset already calculated, thus skip + + // Calculates correlation value for the mixing position corresponding + // to 'i' + corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm); + // heuristic rule to slightly favour values close to mid of the range + float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength; + corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp)); + + // Checks for the highest correlation value + if (corr > bestCorr) + { + bestCorr = corr; + bestOffs = i; + } + } + + // Scans surroundings of the 2nd best match with small stepping + end = _MIN(bestOffs2 + SCANWIND + 1, seekLength); + for (i = bestOffs2 - SCANWIND; i < end; i++) + { + if (i == bestOffs2) continue; // this offset already calculated, thus skip + + // Calculates correlation value for the mixing position corresponding + // to 'i' + corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm); + // heuristic rule to slightly favour values close to mid of the range + float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength; + corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp)); + + // Checks for the highest correlation value + if (corr > bestCorr) + { + bestCorr = corr; + bestOffs = i; } - corrOffset = bestOffs; } + // clear cross correlation routine state if necessary (is so e.g. in MMX routines). clearCrossCorrState(); +#ifdef SOUNDTOUCH_INTEGER_SAMPLES + adaptNormalizer(); +#endif + return bestOffs; } + +/// For integer algorithm: adapt normalization factor divider with music so that +/// it'll not be pessimistically restrictive that can degrade quality on quieter sections +/// yet won't cause integer overflows either +void TDStretch::adaptNormalizer() +{ + // Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to + // too low values during pauses in music + if ((maxnorm > 1000) || (maxnormf > 40000000)) + { + //norm averaging filter + maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm; + + if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16)) + { + // large values, so increase divider + overlapDividerBitsNorm++; + if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase + } + else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0)) + { + // extra small values, decrease divider + overlapDividerBitsNorm--; + } + } + + maxnorm = 0; +} + + /// clear cross correlation routine state if necessary void TDStretch::clearCrossCorrState() { @@ -417,18 +515,18 @@ void TDStretch::clearCrossCorrState() void TDStretch::calcSeqParameters() { // Adjust tempo param according to tempo, so that variating processing sequence length is used - // at varius tempo settings, between the given low...top limits + // at various tempo settings, between the given low...top limits #define AUTOSEQ_TEMPO_LOW 0.5 // auto setting low tempo range (-50%) #define AUTOSEQ_TEMPO_TOP 2.0 // auto setting top tempo range (+100%) // sequence-ms setting values at above low & top tempo - #define AUTOSEQ_AT_MIN 125.0 - #define AUTOSEQ_AT_MAX 50.0 + #define AUTOSEQ_AT_MIN 90.0 + #define AUTOSEQ_AT_MAX 40.0 #define AUTOSEQ_K ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW)) #define AUTOSEQ_C (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW)) - // seek-window-ms setting values at above low & top tempo - #define AUTOSEEK_AT_MIN 25.0 + // seek-window-ms setting values at above low & top tempoq + #define AUTOSEEK_AT_MIN 20.0 #define AUTOSEEK_AT_MAX 15.0 #define AUTOSEEK_K ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW)) #define AUTOSEEK_C (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW)) @@ -464,7 +562,7 @@ void TDStretch::calcSeqParameters() // Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower // tempo, larger faster tempo. -void TDStretch::setTempo(float newTempo) +void TDStretch::setTempo(double newTempo) { int intskip; @@ -475,7 +573,7 @@ void TDStretch::setTempo(float newTempo) // Calculate ideal skip length (according to tempo value) nominalSkip = tempo * (seekWindowLength - overlapLength); - intskip = (int)(nominalSkip + 0.5f); + intskip = (int)(nominalSkip + 0.5); // Calculate how many samples are needed in the 'inputBuffer' to // process another batch of samples @@ -488,9 +586,8 @@ void TDStretch::setTempo(float newTempo) // Sets the number of channels, 1 = mono, 2 = stereo void TDStretch::setChannels(int numChannels) { - assert(numChannels > 0); - if (channels == numChannels) return; -// assert(numChannels == 1 || numChannels == 2); + if (!verifyNumberOfChannels(numChannels) || + (channels == numChannels)) return; channels = numChannels; inputBuffer.setChannels(channels); @@ -539,7 +636,8 @@ void TDStretch::processNominalTempo() // the result into 'outputBuffer' void TDStretch::processSamples() { - int ovlSkip, offset; + int ovlSkip; + int offset = 0; int temp; /* Removed this small optimization - can introduce a click to sound when tempo setting @@ -556,35 +654,62 @@ void TDStretch::processSamples() // to form a processing frame. while ((int)inputBuffer.numSamples() >= sampleReq) { - // If tempo differs from the normal ('SCALE'), scan for the best overlapping - // position - offset = seekBestOverlapPosition(inputBuffer.ptrBegin()); - - // Mix the samples in the 'inputBuffer' at position of 'offset' with the - // samples in 'midBuffer' using sliding overlapping - // ... first partially overlap with the end of the previous sequence - // (that's in 'midBuffer') - overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset); - outputBuffer.putSamples((uint)overlapLength); + if (isBeginning == false) + { + // apart from the very beginning of the track, + // scan for the best overlapping position & do overlap-add + offset = seekBestOverlapPosition(inputBuffer.ptrBegin()); + + // Mix the samples in the 'inputBuffer' at position of 'offset' with the + // samples in 'midBuffer' using sliding overlapping + // ... first partially overlap with the end of the previous sequence + // (that's in 'midBuffer') + overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset); + outputBuffer.putSamples((uint)overlapLength); + offset += overlapLength; + } + else + { + // Adjust processing offset at beginning of track by not perform initial overlapping + // and compensating that in the 'input buffer skip' calculation + isBeginning = false; + int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5); + + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode, round the skip amount to value corresponding to aligned memory address + if (channels == 1) + { + skip &= -4; + } + else if (channels == 2) + { + skip &= -2; + } + #endif + skipFract -= skip; + if (skipFract <= -nominalSkip) + { + skipFract = -nominalSkip; + } + } // ... then copy sequence samples from 'inputBuffer' to output: - // length of sequence - temp = (seekWindowLength - 2 * overlapLength); - // crosscheck that we don't have buffer overflow... - if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2)) + if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength)) { continue; // just in case, shouldn't really happen } - outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp); + // length of sequence + temp = (seekWindowLength - 2 * overlapLength); + outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp); // Copies the end of the current sequence from 'inputBuffer' to // 'midBuffer' for being mixed with the beginning of the next // processing sequence and so on - assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples()); - memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), + assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples()); + memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp), channels * sizeof(SAMPLETYPE) * overlapLength); // Remove the processed samples from the input buffer. Update @@ -680,7 +805,7 @@ TDStretch * TDStretch::newInstance() ////////////////////////////////////////////////////////////////////////////// // -// Integer arithmetics specific algorithm implementations. +// Integer arithmetic specific algorithm implementations. // ////////////////////////////////////////////////////////////////////////////// @@ -694,7 +819,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const short temp; int cnt2; - for (i = 0; i < overlapLength ; i ++) + for (i = 0; i < overlapLength ; i ++) { temp = (short)(overlapLength - i); cnt2 = 2 * i; @@ -706,21 +831,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi' // version of the routine. -void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const +void TDStretch::overlapMulti(short *poutput, const short *input) const { - SAMPLETYPE m1=(SAMPLETYPE)0; - SAMPLETYPE m2; - int i=0; + short m1; + int i = 0; - for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --) + for (m1 = 0; m1 < overlapLength; m1 ++) { + short m2 = (short)(overlapLength - m1); for (int c = 0; c < channels; c ++) { poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength; i++; } - - m1++; } } @@ -743,13 +866,15 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) // calculate overlap length so that it's power of 2 - thus it's easy to do // integer division by right-shifting. Term "-1" at end is to account for // the extra most significatnt bit left unused in result by signed multiplication - overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1; - if (overlapDividerBits > 9) overlapDividerBits = 9; - if (overlapDividerBits < 3) overlapDividerBits = 3; - newOvl = (int)pow(2.0, (int)overlapDividerBits + 1); // +1 => account for -1 above + overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1; + if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9; + if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3; + newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1); // +1 => account for -1 above acceptNewOverlapLength(newOvl); + overlapDividerBitsNorm = overlapDividerBitsPure; + // calculate sloping divider so that crosscorrelation operation won't // overflow 32-bit register. Max. sum of the crosscorrelation sum without // divider would be 2^30*(N^3-N)/3, where N = overlap length @@ -757,28 +882,40 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) } -double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) const +double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm) { long corr; - long lnorm; + unsigned long lnorm; int i; + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary + if (((ulongptr)mixingPos) & 15) return -1e50; + #endif + + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + corr = lnorm = 0; - // Same routine for stereo and mono. For stereo, unroll loop for better - // efficiency and gives slightly better resolution against rounding. - // For mono it same routine, just unrolls loop by factor of 4 - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono + for (i = 0; i < ilength; i += 2) { corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow - corr += (mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; lnorm += (mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow - lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + - mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; + // do intermediate scalings to avoid integer overflow } + if (lnorm > maxnorm) + { + // modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode + #pragma omp critical + if (lnorm > maxnorm) + { + maxnorm = lnorm; + } + } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation norm = (double)lnorm; @@ -787,38 +924,42 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value -double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) const +double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) { long corr; long lnorm; int i; + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + // cancel first normalizer tap from previous round lnorm = 0; for (i = 1; i <= channels; i ++) { - lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBits; + lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm; } corr = 0; - // Same routine for stereo and mono. For stereo, unroll loop for better - // efficiency and gives slightly better resolution against rounding. - // For mono it same routine, just unrolls loop by factor of 4 - for (i = 0; i < channels * overlapLength; i += 4) + // Same routine for stereo and mono. + for (i = 0; i < ilength; i += 2) { corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; // notice: do intermediate division here to avoid integer overflow - corr += (mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; } // update normalizer with last samples of this round for (int j = 0; j < channels; j ++) { i --; - lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits; + lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm; } + norm += (double)lnorm; + if (norm > maxnorm) + { + maxnorm = (unsigned long)norm; + } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation @@ -829,7 +970,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c ////////////////////////////////////////////////////////////////////////////// // -// Floating point arithmetics specific algorithm implementations. +// Floating point arithmetic specific algorithm implementations. // #ifdef SOUNDTOUCH_FLOAT_SAMPLES @@ -903,29 +1044,26 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) /// Calculate cross-correlation -double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const +double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) { - double corr; - double norm; + float corr; + float norm; int i; - corr = norm = 0; - // Same routine for stereo and mono. For Stereo, unroll by factor of 2. - // For mono it's same routine yet unrollsd by factor of 4. - for (i = 0; i < channels * overlapLength; i += 4) - { - corr += mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]; - - norm += mixingPos[i] * mixingPos[i] + - mixingPos[i + 1] * mixingPos[i + 1]; + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary + if (((ulongptr)mixingPos) & 15) return -1e50; + #endif - // unroll the loop for better CPU efficiency: - corr += mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]; + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; - norm += mixingPos[i + 2] * mixingPos[i + 2] + - mixingPos[i + 3] * mixingPos[i + 3]; + corr = norm = 0; + // Same routine for stereo and mono + for (i = 0; i < ilength; i ++) + { + corr += mixingPos[i] * compare[i]; + norm += mixingPos[i] * mixingPos[i]; } anorm = norm; @@ -934,9 +1072,9 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value -double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) const +double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) { - double corr; + float corr; int i; corr = 0; @@ -947,14 +1085,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c norm -= mixingPos[-i] * mixingPos[-i]; } - // Same routine for stereo and mono. For Stereo, unroll by factor of 2. - // For mono it's same routine yet unrollsd by factor of 4. - for (i = 0; i < channels * overlapLength; i += 4) + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + + // Same routine for stereo and mono + for (i = 0; i < ilength; i ++) { - corr += mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1] + - mixingPos[i + 2] * compare[i + 2] + - mixingPos[i + 3] * compare[i + 3]; + corr += mixingPos[i] * compare[i]; } // update normalizer with last samples of this round |