Audio resampler update to add S16 filters

This does not affect the existing resamplers.
New resampler accessed through additional quality settings:

DYN_LOW_QUALITY = 5
DYN_MED_QUALITY = 6
DYN_HIGH_QUALITY = 7

Change-Id: Iebbd31871e808a4a6dee3f3abfd7e9dcf77c48e1
Signed-off-by: Andy Hung <hunga@google.com>
diff --git a/services/audioflinger/Android.mk b/services/audioflinger/Android.mk
index 3ec9285..4524d3c 100644
--- a/services/audioflinger/Android.mk
+++ b/services/audioflinger/Android.mk
@@ -23,7 +23,8 @@
     AudioPolicyService.cpp      \
     ServiceUtilities.cpp        \
     AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
 
 LOCAL_SRC_FILES += StateQueue.cpp
 
@@ -74,10 +75,11 @@
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES:=               \
-	test-resample.cpp 			\
+    test-resample.cpp           \
     AudioResampler.cpp.arm      \
-	AudioResamplerCubic.cpp.arm \
-    AudioResamplerSinc.cpp.arm
+    AudioResamplerCubic.cpp.arm \
+    AudioResamplerSinc.cpp.arm  \
+    AudioResamplerDyn.cpp.arm
 
 LOCAL_C_INCLUDES := \
     $(call include-path-for, audio-utils)
diff --git a/services/audioflinger/AudioResampler.cpp b/services/audioflinger/AudioResampler.cpp
index 323f1a4..3b5a8c1 100644
--- a/services/audioflinger/AudioResampler.cpp
+++ b/services/audioflinger/AudioResampler.cpp
@@ -25,6 +25,7 @@
 #include "AudioResampler.h"
 #include "AudioResamplerSinc.h"
 #include "AudioResamplerCubic.h"
+#include "AudioResamplerDyn.h"
 
 #ifdef __arm__
 #include <machine/cpu-features.h>
@@ -85,6 +86,9 @@
     case MED_QUALITY:
     case HIGH_QUALITY:
     case VERY_HIGH_QUALITY:
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
         return true;
     default:
         return false;
@@ -105,7 +109,7 @@
         if (*endptr == '\0') {
             defaultQuality = (src_quality) l;
             ALOGD("forcing AudioResampler quality to %d", defaultQuality);
-            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > VERY_HIGH_QUALITY) {
+            if (defaultQuality < DEFAULT_QUALITY || defaultQuality > DYN_HIGH_QUALITY) {
                 defaultQuality = DEFAULT_QUALITY;
             }
         }
@@ -125,6 +129,12 @@
         return 20;
     case VERY_HIGH_QUALITY:
         return 34;
+    case DYN_LOW_QUALITY:
+        return 4;
+    case DYN_MED_QUALITY:
+        return 6;
+    case DYN_HIGH_QUALITY:
+        return 12;
     }
 }
 
@@ -175,6 +185,15 @@
         case VERY_HIGH_QUALITY:
             quality = HIGH_QUALITY;
             break;
+        case DYN_LOW_QUALITY:
+            atFinalQuality = true;
+            break;
+        case DYN_MED_QUALITY:
+            quality = DYN_LOW_QUALITY;
+            break;
+        case DYN_HIGH_QUALITY:
+            quality = DYN_MED_QUALITY;
+            break;
         }
     }
     pthread_mutex_unlock(&mutex);
@@ -200,6 +219,12 @@
         ALOGV("Create VERY_HIGH_QUALITY sinc Resampler = %d", quality);
         resampler = new AudioResamplerSinc(bitDepth, inChannelCount, sampleRate, quality);
         break;
+    case DYN_LOW_QUALITY:
+    case DYN_MED_QUALITY:
+    case DYN_HIGH_QUALITY:
+        ALOGV("Create dynamic Resampler = %d", quality);
+        resampler = new AudioResamplerDyn(bitDepth, inChannelCount, sampleRate, quality);
+        break;
     }
 
     // initialize resampler
diff --git a/services/audioflinger/AudioResampler.h b/services/audioflinger/AudioResampler.h
index 33e64ce..c341325 100644
--- a/services/audioflinger/AudioResampler.h
+++ b/services/audioflinger/AudioResampler.h
@@ -41,6 +41,9 @@
         MED_QUALITY=2,
         HIGH_QUALITY=3,
         VERY_HIGH_QUALITY=4,
+        DYN_LOW_QUALITY=5,
+        DYN_MED_QUALITY=6,
+        DYN_HIGH_QUALITY=7,
     };
 
     static AudioResampler* create(int bitDepth, int inChannelCount,
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
new file mode 100644
index 0000000..1e38d3e
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -0,0 +1,530 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "AudioResamplerDyn"
+//#define LOG_NDEBUG 0
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <math.h>
+
+#include <cutils/compiler.h>
+#include <cutils/properties.h>
+#include <utils/Log.h>
+
+#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirProcess.h"
+#include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirGen.h" // requires math.h
+#include "AudioResamplerDyn.h"
+
+//#define DEBUG_RESAMPLER
+
+namespace android {
+
+// generate a unique resample type compile-time constant (constexpr)
+#define RESAMPLETYPE(CHANNELS, LOCKED, STRIDE, COEFTYPE) \
+    ((((CHANNELS)-1)&1) | !!(LOCKED)<<1 | (COEFTYPE)<<2 \
+    | ((STRIDE)==8 ? 1 : (STRIDE)==16 ? 2 : 0)<<3)
+
+/*
+ * InBuffer is a type agnostic input buffer.
+ *
+ * Layout of the state buffer for halfNumCoefs=8.
+ *
+ * [rrrrrrppppppppnnnnnnnnrrrrrrrrrrrrrrrrrrr.... rrrrrrr]
+ *  S            I                                R
+ *
+ * S = mState
+ * I = mImpulse
+ * R = mRingFull
+ * p = past samples, convoluted with the (p)ositive side of sinc()
+ * n = future samples, convoluted with the (n)egative side of sinc()
+ * r = extra space for implementing the ring buffer
+ */
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::InBuffer()
+    : mState(NULL), mImpulse(NULL), mRingFull(NULL), mStateSize(0) {
+}
+
+template<typename TI>
+AudioResamplerDyn::InBuffer<TI>::~InBuffer() {
+    init();
+}
+
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::init() {
+    free(mState);
+    mState = NULL;
+    mImpulse = NULL;
+    mRingFull = NULL;
+    mStateSize = 0;
+}
+
+// resizes the state buffer to accommodate the appropriate filter length
+template<typename TI>
+void AudioResamplerDyn::InBuffer<TI>::resize(int CHANNELS, int halfNumCoefs) {
+    // calculate desired state size
+    int stateSize = halfNumCoefs * CHANNELS * 2
+            * kStateSizeMultipleOfFilterLength;
+
+    // check if buffer needs resizing
+    if (mState
+            && stateSize == mStateSize
+            && mRingFull-mState == mStateSize-halfNumCoefs*CHANNELS) {
+        return;
+    }
+
+    // create new buffer
+    TI* state = (int16_t*)memalign(32, stateSize*sizeof(*state));
+    memset(state, 0, stateSize*sizeof(*state));
+
+    // attempt to preserve state
+    if (mState) {
+        TI* srcLo = mImpulse - halfNumCoefs*CHANNELS;
+        TI* srcHi = mImpulse + halfNumCoefs*CHANNELS;
+        TI* dst = state;
+
+        if (srcLo < mState) {
+            dst += mState-srcLo;
+            srcLo = mState;
+        }
+        if (srcHi > mState + mStateSize) {
+            srcHi = mState + mStateSize;
+        }
+        memcpy(dst, srcLo, (srcHi - srcLo) * sizeof(*srcLo));
+        free(mState);
+    }
+
+    // set class member vars
+    mState = state;
+    mStateSize = stateSize;
+    mImpulse = mState + halfNumCoefs*CHANNELS; // actually one sample greater than needed
+    mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS;
+}
+
+// copy in the input data into the head (impulse+halfNumCoefs) of the buffer.
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAgain(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    int16_t* head = impulse + halfNumCoefs*CHANNELS;
+    for (size_t i=0 ; i<CHANNELS ; i++) {
+        head[i] = in[inputIndex*CHANNELS + i];
+    }
+}
+
+// advance the impulse pointer, and load in data into the head (impulse+halfNumCoefs)
+template<typename TI>
+template<int CHANNELS>
+void AudioResamplerDyn::InBuffer<TI>::readAdvance(TI*& impulse, const int halfNumCoefs,
+        const TI* const in, const size_t inputIndex) {
+    impulse += CHANNELS;
+
+    if (CC_UNLIKELY(impulse >= mRingFull)) {
+        const size_t shiftDown = mRingFull - mState - halfNumCoefs*CHANNELS;
+        memcpy(mState, mState+shiftDown, halfNumCoefs*CHANNELS*2*sizeof(TI));
+        impulse -= shiftDown;
+    }
+    readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+}
+
+void AudioResamplerDyn::Constants::set(
+        int L, int halfNumCoefs, int inSampleRate, int outSampleRate)
+{
+    int bits = 0;
+    int lscale = inSampleRate/outSampleRate < 2 ? L - 1 :
+            static_cast<int>(static_cast<uint64_t>(L)*inSampleRate/outSampleRate);
+    for (int i=lscale; i; ++bits, i>>=1)
+        ;
+    mL = L;
+    mShift = kNumPhaseBits - bits;
+    mHalfNumCoefs = halfNumCoefs;
+}
+
+AudioResamplerDyn::AudioResamplerDyn(int bitDepth,
+        int inChannelCount, int32_t sampleRate, src_quality quality)
+    : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
+    mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL)
+{
+    mVolumeSimd[0] = mVolumeSimd[1] = 0;
+    mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better
+}
+
+AudioResamplerDyn::~AudioResamplerDyn() {
+    free(mCoefBuffer);
+}
+
+void AudioResamplerDyn::init() {
+    mFilterSampleRate = 0; // always trigger new filter generation
+    mInBuffer.init();
+}
+
+void AudioResamplerDyn::setVolume(int16_t left, int16_t right) {
+    AudioResampler::setVolume(left, right);
+    mVolumeSimd[0] = static_cast<int32_t>(left)<<16;
+    mVolumeSimd[1] = static_cast<int32_t>(right)<<16;
+}
+
+template <typename T> T max(T a, T b) {return a > b ? a : b;}
+
+template <typename T> T absdiff(T a, T b) {return a > b ? a - b : b - a;}
+
+template<typename T>
+void AudioResamplerDyn::createKaiserFir(Constants &c, double stopBandAtten,
+        int inSampleRate, int outSampleRate, double tbwCheat) {
+    T* buf = reinterpret_cast<T*>(memalign(32, (c.mL+1)*c.mHalfNumCoefs*sizeof(T)));
+    static const double atten = 0.9998;   // to avoid ripple overflow
+    double fcr;
+    double tbw = firKaiserTbw(c.mHalfNumCoefs, stopBandAtten);
+
+    if (inSampleRate < outSampleRate) { // upsample
+        fcr = max(0.5*tbwCheat - tbw/2, tbw/2);
+    } else { // downsample
+        fcr = max(0.5*tbwCheat*outSampleRate/inSampleRate - tbw/2, tbw/2);
+    }
+    // create and set filter
+    firKaiserGen(buf, c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten);
+    c.setBuf(buf);
+    if (mCoefBuffer) {
+        free(mCoefBuffer);
+    }
+    mCoefBuffer = buf;
+#ifdef DEBUG_RESAMPLER
+    // print basic filter stats
+    printf("L:%d  hnc:%d  stopBandAtten:%lf  fcr:%lf  atten:%lf  tbw:%lf\n",
+            c.mL, c.mHalfNumCoefs, stopBandAtten, fcr, atten, tbw);
+    // test the filter and report results
+    double fp = (fcr - tbw/2)/c.mL;
+    double fs = (fcr + tbw/2)/c.mL;
+    double fmin, fmax;
+    testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax);
+    double d1 = (fmax - fmin)/2.;
+    double ap = -20.*log10(1. - d1); // passband ripple
+    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap);
+    testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax);
+    double d2 = fmax;
+    double as = -20.*log10(d2); // stopband attenuation
+    printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as);
+#endif
+}
+
+// recursive gcd (TODO: verify tail recursion elimination should make this iterate)
+static int gcd(int n, int m) {
+    if (m == 0) {
+        return n;
+    }
+    return gcd(m, n % m);
+}
+
+static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) {
+    int pdiff = absdiff(newSampleRate, prevSampleRate);
+    int adiff = absdiff(newSampleRate, filterSampleRate);
+
+    // allow up to 6% relative change increments.
+    // allow up to 12% absolute change increments (from filter design)
+    return pdiff < prevSampleRate>>4 && adiff < filterSampleRate>>3;
+}
+
+void AudioResamplerDyn::setSampleRate(int32_t inSampleRate) {
+    if (mInSampleRate == inSampleRate) {
+        return;
+    }
+    int32_t oldSampleRate = mInSampleRate;
+    int32_t oldHalfNumCoefs = mConstants.mHalfNumCoefs;
+    uint32_t oldPhaseWrapLimit = mConstants.mL << mConstants.mShift;
+    bool useS32 = false;
+
+    mInSampleRate = inSampleRate;
+
+    // TODO: Add precalculated Equiripple filters
+
+    if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) {
+        mFilterSampleRate = inSampleRate;
+
+        // Begin Kaiser Filter computation
+        //
+        // The quantization floor for S16 is about 96db - 10*log_10(#length) + 3dB.
+        // Keep the stop band attenuation no greater than 84-85dB for 32 length S16 filters
+        //
+        // For s32 we keep the stop band attenuation at the same as 16b resolution, about
+        // 96-98dB
+        //
+
+        double stopBandAtten;
+        double tbwCheat = 1.; // how much we "cheat" into aliasing
+        int halfLength;
+        if (getQuality() == DYN_HIGH_QUALITY) {
+            // 32b coefficients, 64 length
+            useS32 = true;
+            stopBandAtten = 98.;
+            halfLength = 32;
+        } else if (getQuality() == DYN_LOW_QUALITY) {
+            // 16b coefficients, 16-32 length
+            useS32 = false;
+            stopBandAtten = 80.;
+            if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 16;
+            } else {
+                halfLength = 8;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.05;
+            } else {
+                tbwCheat = 1.03;
+            }
+        } else { // medium quality
+            // 16b coefficients, 32-64 length
+            useS32 = false;
+            stopBandAtten = 84.;
+            if (mSampleRate >= inSampleRate * 4) {
+                halfLength = 32;
+            } else if (mSampleRate >= inSampleRate * 2) {
+                halfLength = 24;
+            } else {
+                halfLength = 16;
+            }
+            if (mSampleRate >= inSampleRate) {
+                tbwCheat = 1.03;
+            } else {
+                tbwCheat = 1.01;
+            }
+        }
+
+        // determine the number of polyphases in the filterbank.
+        // for 16b, it is desirable to have 2^(16/2) = 256 phases.
+        // https://ccrma.stanford.edu/~jos/resample/Relation_Interpolation_Error_Quantization.html
+        //
+        // We are a bit more lax on this.
+
+        int phases = mSampleRate / gcd(mSampleRate, inSampleRate);
+
+        while (phases<63) { // too few phases, allow room for interpolation
+            phases *= 2; // this code only needed to support dynamic rate changes
+        }
+        if (phases>=256) {  // too many phases, always interpolate
+            phases = 127;
+        }
+
+        // create the filter
+        mConstants.set(phases, halfLength, inSampleRate, mSampleRate);
+        if (useS32) {
+            createKaiserFir<int32_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        } else {
+            createKaiserFir<int16_t>(mConstants, stopBandAtten,
+                    inSampleRate, mSampleRate, tbwCheat);
+        }
+    } // End Kaiser filter
+
+    // update phase and state based on the new filter.
+    const Constants& c(mConstants);
+    mInBuffer.resize(mChannelCount, c.mHalfNumCoefs);
+    const uint32_t phaseWrapLimit = c.mL << c.mShift;
+    // try to preserve as much of the phase fraction as possible for on-the-fly changes
+    mPhaseFraction = static_cast<unsigned long long>(mPhaseFraction)
+            * phaseWrapLimit / oldPhaseWrapLimit;
+    mPhaseFraction %= phaseWrapLimit; // should not do anything, but just in case.
+    mPhaseIncrement = static_cast<uint32_t>(static_cast<double>(phaseWrapLimit)
+            * inSampleRate / mSampleRate);
+
+    // determine which resampler to use
+    // check if locked phase (works only if mPhaseIncrement has no "fractional phase bits")
+    int locked = (mPhaseIncrement << (sizeof(mPhaseIncrement)*8 - c.mShift)) == 0;
+    int stride = (c.mHalfNumCoefs&7)==0 ? 16 : (c.mHalfNumCoefs&3)==0 ? 8 : 2;
+    if (locked) {
+        mPhaseFraction = mPhaseFraction >> c.mShift << c.mShift; // remove fractional phase
+    }
+    if (!USE_NEON) {
+        stride = 2; // C version only
+    }
+    // TODO: Remove this for testing
+    //stride = 2;
+    mResampleType = RESAMPLETYPE(mChannelCount, locked, stride, !!useS32);
+#ifdef DEBUG_RESAMPLER
+    printf("channels:%d  %s  stride:%d  %s  coef:%d  shift:%d\n",
+            mChannelCount, locked ? "locked" : "interpolated",
+            stride, useS32 ? "S32" : "S16", 2*c.mHalfNumCoefs, c.mShift);
+#endif
+}
+
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider)
+{
+    // TODO:
+    // 24 cases - this perhaps can be reduced later, as testing might take too long
+    switch (mResampleType) {
+
+    // stride 16 (stride 2 for machines that do not support NEON)
+    case RESAMPLETYPE(1, true, 16, 0):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 16, 0):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 16, 0):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 16, 0):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 16, 1):
+        return resample<1, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 16, 1):
+        return resample<2, true, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 16, 1):
+        return resample<1, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 16, 1):
+        return resample<2, false, 16>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#if 0
+    // TODO: Remove these?
+    // stride 8
+    case RESAMPLETYPE(1, true, 8, 0):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 8, 0):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 8, 0):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 8, 0):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 8, 1):
+        return resample<1, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 8, 1):
+        return resample<2, true, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 8, 1):
+        return resample<1, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 8, 1):
+        return resample<2, false, 8>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    // stride 2 (can handle any filter length)
+    case RESAMPLETYPE(1, true, 2, 0):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, true, 2, 0):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, false, 2, 0):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(2, false, 2, 0):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS16, provider);
+    case RESAMPLETYPE(1, true, 2, 1):
+        return resample<1, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, true, 2, 1):
+        return resample<2, true, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(1, false, 2, 1):
+        return resample<1, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+    case RESAMPLETYPE(2, false, 2, 1):
+        return resample<2, false, 2>(out, outFrameCount, mConstants.mFirCoefsS32, provider);
+#endif
+    default:
+        ; // error
+    }
+}
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+void AudioResamplerDyn::resample(int32_t* out, size_t outFrameCount,
+        const TC* const coefs,  AudioBufferProvider* provider)
+{
+    const Constants& c(mConstants);
+    int16_t* impulse = mInBuffer.getImpulse();
+    size_t inputIndex = mInputIndex;
+    uint32_t phaseFraction = mPhaseFraction;
+    const uint32_t phaseIncrement = mPhaseIncrement;
+    size_t outputIndex = 0;
+    size_t outputSampleCount = outFrameCount * 2;   // stereo output
+    size_t inFrameCount = (outFrameCount*mInSampleRate)/mSampleRate;
+    const uint32_t phaseWrapLimit = c.mL << c.mShift;
+
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    // the following logic is a bit convoluted to keep the main processing loop
+    // as tight as possible with register allocation.
+    while (outputIndex < outputSampleCount) {
+        // buffer is empty, fetch a new one
+        while (mBuffer.frameCount == 0) {
+            mBuffer.frameCount = inFrameCount;
+            provider->getNextBuffer(&mBuffer,
+                    calculateOutputPTS(outputIndex / 2));
+            if (mBuffer.raw == NULL) {
+                goto resample_exit;
+            }
+            if (phaseFraction >= phaseWrapLimit) { // read in data
+                mInBuffer.readAdvance<CHANNELS>(
+                        impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+                while (phaseFraction >= phaseWrapLimit) {
+                    inputIndex++;
+                    if (inputIndex >= mBuffer.frameCount) {
+                        inputIndex -= mBuffer.frameCount;
+                        provider->releaseBuffer(&mBuffer);
+                        break;
+                    }
+                    mInBuffer.readAdvance<CHANNELS>(
+                            impulse, c.mHalfNumCoefs, mBuffer.i16, inputIndex);
+                    phaseFraction -= phaseWrapLimit;
+                }
+            }
+        }
+        const int16_t* const in = mBuffer.i16;
+        const size_t frameCount = mBuffer.frameCount;
+        const int coefShift = c.mShift;
+        const int halfNumCoefs = c.mHalfNumCoefs;
+        const int32_t* const volumeSimd = mVolumeSimd;
+
+        // reread the last input in.
+        mInBuffer.readAgain<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+
+        // main processing loop
+        while (CC_LIKELY(outputIndex < outputSampleCount)) {
+            // caution: fir() is inlined and may be large.
+            // output will be loaded with the appropriate values
+            //
+            // from the input samples in impulse[-halfNumCoefs+1]... impulse[halfNumCoefs]
+            // from the polyphase filter of (phaseFraction / phaseWrapLimit) in coefs.
+            //
+            fir<CHANNELS, LOCKED, STRIDE>(
+                    &out[outputIndex],
+                    phaseFraction, phaseWrapLimit,
+                    coefShift, halfNumCoefs, coefs,
+                    impulse, volumeSimd);
+            outputIndex += 2;
+
+            phaseFraction += phaseIncrement;
+            while (phaseFraction >= phaseWrapLimit) {
+                inputIndex++;
+                if (inputIndex >= frameCount) {
+                    goto done;  // need a new buffer
+                }
+                mInBuffer.readAdvance<CHANNELS>(impulse, halfNumCoefs, in, inputIndex);
+                phaseFraction -= phaseWrapLimit;
+            }
+        }
+done:
+        // often arrives here when input buffer runs out
+        if (inputIndex >= frameCount) {
+            inputIndex -= frameCount;
+            provider->releaseBuffer(&mBuffer);
+            // mBuffer.frameCount MUST be zero here.
+        }
+    }
+
+resample_exit:
+    mInBuffer.setImpulse(impulse);
+    mInputIndex = inputIndex;
+    mPhaseFraction = phaseFraction;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h
new file mode 100644
index 0000000..85a01ab
--- /dev/null
+++ b/services/audioflinger/AudioResamplerDyn.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_DYN_H
+#define ANDROID_AUDIO_RESAMPLER_DYN_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <cutils/log.h>
+
+#include "AudioResampler.h"
+
+namespace android {
+
+class AudioResamplerDyn: public AudioResampler {
+public:
+    AudioResamplerDyn(int bitDepth, int inChannelCount, int32_t sampleRate,
+            src_quality quality);
+
+    virtual ~AudioResamplerDyn();
+
+    virtual void init();
+
+    virtual void setSampleRate(int32_t inSampleRate);
+
+    virtual void setVolume(int16_t left, int16_t right);
+
+    virtual void resample(int32_t* out, size_t outFrameCount,
+            AudioBufferProvider* provider);
+
+private:
+
+    class Constants { // stores the filter constants.
+    public:
+        Constants() :
+            mL(0), mShift(0), mHalfNumCoefs(0), mFirCoefsS16(NULL)
+        {}
+        void set(int L, int halfNumCoefs,
+                int inSampleRate, int outSampleRate);
+        inline void setBuf(int16_t* buf) {
+            mFirCoefsS16 = buf;
+        }
+        inline void setBuf(int32_t* buf) {
+            mFirCoefsS32 = buf;
+        }
+
+        int mL;       // interpolation phases in the filter.
+        int mShift;   // right shift to get polyphase index
+        unsigned int mHalfNumCoefs; // filter half #coefs
+        union {       // polyphase filter bank
+            const int16_t* mFirCoefsS16;
+            const int32_t* mFirCoefsS32;
+        };
+    };
+
+    // Input buffer management for a given input type TI, now (int16_t)
+    // Is agnostic of the actual type, can work with int32_t and float.
+    template<typename TI>
+    class InBuffer {
+    public:
+        InBuffer();
+        ~InBuffer();
+        void init();
+        void resize(int CHANNELS, int halfNumCoefs);
+
+        // used for direct management of the mImpulse pointer
+        inline TI* getImpulse() {
+            return mImpulse;
+        }
+        inline void setImpulse(TI *impulse) {
+            mImpulse = impulse;
+        }
+        template<int CHANNELS>
+        inline void readAgain(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+        template<int CHANNELS>
+        inline void readAdvance(TI*& impulse, const int halfNumCoefs,
+                const TI* const in, const size_t inputIndex);
+
+    private:
+        // tuning parameter guidelines: 2 <= multiple <= 8
+        static const int kStateSizeMultipleOfFilterLength = 4;
+
+        TI* mState;    // base pointer for the input buffer storage
+        TI* mImpulse;  // current location of the impulse response (centered)
+        TI* mRingFull; // mState <= mImpulse < mRingFull
+        // in general, mRingFull = mState + mStateSize - halfNumCoefs*CHANNELS.
+        size_t mStateSize; // in units of TI.
+    };
+
+    template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+    void resample(int32_t* out, size_t outFrameCount,
+            const TC* const coefs, AudioBufferProvider* provider);
+
+    template<typename T>
+    void createKaiserFir(Constants &c, double stopBandAtten,
+            int inSampleRate, int outSampleRate, double tbwCheat);
+
+    InBuffer<int16_t> mInBuffer;
+    Constants mConstants;  // current set of coefficient parameters
+    int32_t __attribute__ ((aligned (8))) mVolumeSimd[2];
+    int32_t mResampleType; // contains the resample type.
+    int32_t mFilterSampleRate; // designed sample rate for the filter
+    void* mCoefBuffer; // if a filter is created, this is not null
+};
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_DYN_H*/
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
new file mode 100644
index 0000000..fa6ee3e
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_GEN_H
+
+namespace android {
+
+/*
+ * Sinc function is the traditional variant.
+ *
+ * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations)
+ * TODO: Remove comparison at 0 and trap at a higher level.
+ *
+ */
+
+static inline double sinc(double x) {
+    if (fabs(x) < FLT_MIN) {
+        return 1.;
+    }
+    return sin(x) / x;
+}
+
+static inline double sqr(double x) {
+    return x * x;
+}
+
+/*
+ * rounds a double to the nearest integer for FIR coefficients.
+ *
+ * One variant uses noise shaping, which must keep error history
+ * to work (the err parameter, initialized to 0).
+ * The other variant is a non-noise shaped version for
+ * S32 coefficients (noise shaping doesn't gain much).
+ *
+ * Caution: No bounds saturation is applied, but isn't needed in
+ * this case.
+ *
+ * @param x is the value to round.
+ *
+ * @param maxval is the maximum integer scale factor expressed as an int64 (for headroom).
+ * Typically this may be the maximum positive integer+1 (using the fact that double precision
+ * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
+ *
+ * @param err is the previous error (actual - rounded) for the previous rounding op.
+ *
+ */
+
+static inline int64_t toint(double x, int64_t maxval, double& err) {
+    double val = x * maxval;
+    double ival = floor(val + 0.5 + err*0.17);
+    err = val - ival;
+    return static_cast<int64_t>(ival);
+}
+
+static inline int64_t toint(double x, int64_t maxval) {
+    return static_cast<int64_t>(floor(x * maxval + 0.5));
+}
+
+/*
+ * Modified Bessel function of the first kind
+ * http://en.wikipedia.org/wiki/Bessel_function
+ *
+ * The formulas are taken from Abramowitz and Stegun:
+ *
+ * http://people.math.sfu.ca/~cbm/aands/page_375.htm
+ * http://people.math.sfu.ca/~cbm/aands/page_378.htm
+ *
+ * http://dlmf.nist.gov/10.25
+ * http://dlmf.nist.gov/10.40
+ *
+ * Note we assume x is nonnegative (the function is symmetric,
+ * pass in the absolute value as needed).
+ *
+ * Constants are compile time derived with templates I0Term<> and
+ * I0ATerm<> to the precision of the compiler.  The series can be expanded
+ * to any precision needed, but currently set around 24b precision.
+ *
+ * We use a bit of template math here, constexpr would probably be
+ * more appropriate for a C++11 compiler.
+ *
+ */
+
+template <int N>
+struct I0Term {
+    static const double value = I0Term<N-1>::value/ (4. * N * N);
+};
+
+template <>
+struct I0Term<0> {
+    static const double value = 1.;
+};
+
+template <int N>
+struct I0ATerm {
+    static const double value = I0ATerm<N-1>::value * (2.*N-1.) * (2.*N-1.) / (8. * N);
+};
+
+template <>
+struct I0ATerm<0> { // 1/sqrt(2*PI);
+    static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
+};
+
+static inline double I0(double x) {
+    if (x < 3.75) { // TODO: Estrin's method instead of Horner's method?
+        x *= x;
+        return I0Term<0>::value + x*(
+                I0Term<1>::value + x*(
+                I0Term<2>::value + x*(
+                I0Term<3>::value + x*(
+                I0Term<4>::value + x*(
+                I0Term<5>::value + x*(
+                I0Term<6>::value)))))); // e < 1.6e-7
+    }
+    // a bit ugly here - perhaps we expand the top series
+    // to permit computation to x < 20 (a reasonable range)
+    double y = 1./x;
+    return exp(x) * sqrt(y) * (
+            // note: reciprocal squareroot may be easier!
+            // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+            I0ATerm<0>::value + y*(
+            I0ATerm<1>::value + y*(
+            I0ATerm<2>::value + y*(
+            I0ATerm<3>::value + y*(
+            I0ATerm<4>::value + y*(
+            I0ATerm<5>::value + y*(
+            I0ATerm<6>::value + y*(
+            I0ATerm<7>::value + y*(
+            I0ATerm<8>::value))))))))); // (... e) < 1.9e-7
+}
+
+/*
+ * calculates the transition bandwidth for a Kaiser filter
+ *
+ * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+ *
+ * @param halfNumCoef is half the number of coefficients per filter phase.
+ * @param stopBandAtten is the stop band attenuation desired.
+ * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
+ */
+static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
+    return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef);
+}
+
+/*
+ * calculates the fir transfer response.
+ *
+ * calculates the transfer coefficient H(w) for 0 <= w <= PI.
+ * Be careful be careful to consider the fact that this is an interpolated filter
+ * of length L, so normalizing H(w)/L is probably what you expect.
+ */
+template <typename T>
+static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
+    double accum = static_cast<double>(coef[0])*0.5;
+    coef += halfNumCoef;    // skip first row.
+    for (int i=1 ; i<=L ; ++i) {
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            accum += cos(ix*w)*static_cast<double>(*coef++);
+        }
+    }
+    return accum*2.;
+}
+
+/*
+ * returns the minimum and maximum |H(f)| bounds
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fstart is the normalized frequency start.
+ *
+ * @param fend is the normalized frequency end.
+ *
+ * @param steps is the number of steps to take (sampling) between frequency start and end
+ *
+ * @param firMin returns the minimum transfer |H(f)| found
+ *
+ * @param firMax returns the maximum transfer |H(f)| found
+ *
+ * 0 <= f <= 0.5.
+ * This is used to test passband and stopband performance.
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fstart, double fend, int steps, double &firMin, double &firMax) {
+    double wstart = fstart*(2.*M_PI);
+    double wend = fend*(2.*M_PI);
+    double wstep = (wend - wstart)/steps;
+    double fmax, fmin;
+    double trf = firTransfer(coef, L, halfNumCoef, wstart);
+    if (trf<0) {
+        trf = -trf;
+    }
+    fmin = fmax = trf;
+    wstart += wstep;
+    for (int i=1; i<steps; ++i) {
+        trf = firTransfer(coef, L, halfNumCoef, wstart);
+        if (trf<0) {
+            trf = -trf;
+        }
+        if (trf>fmax) {
+            fmax = trf;
+        }
+        else if (trf<fmin) {
+            fmin = trf;
+        }
+        wstart += wstep;
+    }
+    // renormalize - this is only needed for integer filter types
+    double norm = 1./((1ULL<<(sizeof(T)*8-1))*L);
+
+    firMin = fmin * norm;
+    firMax = fmax * norm;
+}
+
+/*
+ * Calculates the polyphase filter banks based on a windowed sinc function.
+ *
+ * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
+ * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks.
+ * The last filterbank is used for interpolation purposes (and is mostly composed
+ * of the first bank shifted by one sample), and is unnecessary if one does
+ * not do interpolation.
+ *
+ * @param coef is the caller allocated space for coefficients.  This should be
+ * exactly (L+1)*halfNumCoef in size.
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param stopBandAtten is the stopband value, should be >50dB.
+ *
+ * @param fcr is cutoff frequency/sampling rate (<0.5).  At this point, the energy
+ * should be 6dB less. (fcr is where the amplitude drops by half).  Use the
+ * firKaiserTbw() to calculate the transition bandwidth.  fcr is the midpoint
+ * between the stop band and the pass band (fstop+fpass)/2.
+ *
+ * @param atten is the attenuation (generally slightly less than 1).
+ */
+
+template <typename T>
+static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
+        double stopBandAtten, double fcr, double atten) {
+    //
+    // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+    //
+    // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
+    //
+    // Kaiser window and beta parameter
+    //
+    //         | 0.1102*(A - 8.7)                         A > 50
+    //  beta = | 0.5842*(A - 21)^0.4 + 0.07886*(A - 21)   21 <= A <= 50
+    //         | 0.                                       A < 21
+    //
+    // with A is the desired stop-band attenuation in dBFS
+    //
+    //    30 dB    2.210
+    //    40 dB    3.384
+    //    50 dB    4.538
+    //    60 dB    5.658
+    //    70 dB    6.764
+    //    80 dB    7.865
+    //    90 dB    8.960
+    //   100 dB   10.056
+
+    const int N = L * halfNumCoef; // non-negative half
+    const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
+    const double yscale = 2. * atten * fcr / I0(beta);
+    const double xstep = 2. * M_PI * fcr / L;
+    const double xfrac = 1. / N;
+    double err = 0; // for noise shaping on int16_t coefficients
+    for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+        for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
+            double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale;
+
+            // (caution!) float version does not need rounding
+            if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1), err));
+            } else {
+                *coef++ = static_cast<T>(toint(y, 1ULL<<(sizeof(T)*8-1)));
+            }
+        }
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_GEN_H*/
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
new file mode 100644
index 0000000..bf2163f
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_OPS_H
+
+namespace android {
+
+#if defined(__arm__) && !defined(__thumb__)
+#define USE_INLINE_ASSEMBLY (true)
+#else
+#define USE_INLINE_ASSEMBLY (false)
+#endif
+
+#if USE_INLINE_ASSEMBLY && defined(__ARM_NEON__)
+#define USE_NEON (true)
+#include <arm_neon.h>
+#else
+#define USE_NEON (false)
+#endif
+
+template<typename T, typename U>
+struct is_same
+{
+    static const bool value = false;
+};
+
+template<typename T>
+struct is_same<T, T>  // partial specialization
+{
+    static const bool value = true;
+};
+
+static inline
+int32_t mulRL(int left, int32_t in, uint32_t vRL)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smultb %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    } else {
+        asm( "smultt %[out], %[in], %[vRL] \n"
+             : [out]"=r"(out)
+             : [in]"%r"(in), [vRL]"r"(vRL)
+             : );
+    }
+    return out;
+#else
+    int16_t v = left ? static_cast<int16_t>(vRL) : static_cast<int16_t>(vRL>>16);
+    return static_cast<int32_t>((static_cast<int64_t>(in) * v) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlabb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + v * in;
+#endif
+}
+
+static inline
+int32_t mulAdd(int16_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smlawb %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 16);
+#endif
+}
+
+static inline
+int32_t mulAdd(int32_t in, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    asm( "smmla %[out], %[v], %[in], %[a] \n"
+         : [out]"=r"(out)
+         : [in]"%r"(in), [v]"r"(v), [a]"r"(a)
+         : );
+    return out;
+#else
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * in) >> 32);
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smlabb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlabt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + v * s;
+#endif
+}
+
+static inline
+int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
+{
+#if USE_INLINE_ASSEMBLY
+    int32_t out;
+    if (left) {
+        asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    } else {
+        asm( "smlawt %[out], %[v], %[inRL], %[a] \n"
+             : [out]"=r"(out)
+             : [inRL]"%r"(inRL), [v]"r"(v), [a]"r"(a)
+             : );
+    }
+    return out;
+#else
+    int16_t s = left ? static_cast<int16_t>(inRL) : static_cast<int16_t>(inRL>>16);
+    return a + static_cast<int32_t>((static_cast<int64_t>(v) * s) >> 16);
+#endif
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_OPS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcess.h b/services/audioflinger/AudioResamplerFirProcess.h
new file mode 100644
index 0000000..38e387c
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcess.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h
+
+template<int CHANNELS, typename TC>
+static inline
+void mac(
+        int32_t& l, int32_t& r,
+        const TC coef,
+        const int16_t* samples)
+{
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+        l = mulAddRL(1, rl, coef, l);
+        r = mulAddRL(0, rl, coef, r);
+    } else {
+        r = l = mulAdd(samples[0], coef, l);
+    }
+}
+
+template<int CHANNELS, typename TC>
+static inline
+void interpolate(
+        int32_t& l, int32_t& r,
+        const TC coef_0, const TC coef_1,
+        const int16_t lerp, const int16_t* samples)
+{
+    TC sinc;
+
+    if (is_same<TC, int16_t>::value) {
+        sinc = (lerp * ((coef_1-coef_0)<<1)>>16) + coef_0;
+    } else {
+        sinc = mulAdd(lerp, (coef_1-coef_0)<<1, coef_0);
+    }
+    if (CHANNELS == 2) {
+        uint32_t rl = *reinterpret_cast<const uint32_t*>(samples);
+        l = mulAddRL(1, rl, sinc, l);
+        r = mulAddRL(0, rl, sinc, r);
+    } else {
+        r = l = mulAdd(samples[0], sinc, l);
+    }
+}
+
+/*
+ * Calculates a single output sample (two stereo frames).
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is a locked phase filter (it does not compute the interpolation).
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void ProcessL(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    int32_t l = 0;
+    int32_t r = 0;
+    do {
+        mac<CHANNELS>(l, r, *coefsP++, sP);
+        sP -= CHANNELS;
+        mac<CHANNELS>(l, r, *coefsN++, sN);
+        sN += CHANNELS;
+    } while (--count > 0);
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) interpolating phase.
+ *
+ * This function computes both the positive half FIR dot product and
+ * the negative half FIR dot product, accumulates, and then applies the volume.
+ *
+ * This is an interpolated phase filter.
+ *
+ * Use fir() to compute the proper coefficient pointers for a polyphase
+ * filter bank.
+ */
+
+template <int CHANNELS, int STRIDE, typename TC>
+static inline
+void Process(int32_t* const out,
+        int count,
+        const TC* coefsP,
+        const TC* coefsN,
+        const TC* coefsP1,
+        const TC* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    (void) coefsP1; // suppress unused parameter warning
+    (void) coefsN1;
+    if (sizeof(*coefsP)==4) {
+        lerpP >>= 16;   // ensure lerpP is 16b
+    }
+    int32_t l = 0;
+    int32_t r = 0;
+    for (size_t i = 0; i < count; ++i) {
+        interpolate<CHANNELS>(l, r, coefsP[0], coefsP[count], lerpP, sP);
+        coefsP++;
+        sP -= CHANNELS;
+        interpolate<CHANNELS>(l, r, coefsN[count], coefsN[0], lerpP, sN);
+        coefsN++;
+        sN += CHANNELS;
+    }
+    out[0] += 2 * mulRL(0, l, volumeLR[0]); // Note: only use top 16b
+    out[1] += 2 * mulRL(0, r, volumeLR[1]); // Note: only use top 16b
+}
+
+/*
+ * Calculates a single output sample (two stereo frames) from input sample pointer.
+ *
+ * This sets up the params for the accelerated Process() and ProcessL()
+ * functions to do the appropriate dot products.
+ *
+ * @param out should point to the output buffer with at least enough space for 2 output frames.
+ *
+ * @param phase is the fractional distance between input samples for interpolation:
+ * phase >= 0  && phase < phaseWrapLimit.  It can be thought of as a rational fraction
+ * of phase/phaseWrapLimit.
+ *
+ * @param phaseWrapLimit is #polyphases<<coefShift, where #polyphases is the number of polyphases
+ * in the polyphase filter. Likewise, #polyphases can be obtained as (phaseWrapLimit>>coefShift).
+ *
+ * @param coefShift gives the bit alignment of the polyphase index in the phase parameter.
+ *
+ * @param halfNumCoefs is the half the number of coefficients per polyphase filter. Since the
+ * overall filterbank is odd-length symmetric, only halfNumCoefs need be stored.
+ *
+ * @param coefs is the polyphase filter bank, starting at from polyphase index 0, and ranging to
+ * and including the #polyphases.  Each polyphase of the filter has half-length halfNumCoefs
+ * (due to symmetry).  The total size of the filter bank in coefficients is
+ * (#polyphases+1)*halfNumCoefs.
+ *
+ * The filter bank coefs should be aligned to a minimum of 16 bytes (preferrably to cache line).
+ *
+ * The coefs should be attenuated (to compensate for passband ripple)
+ * if storing back into the native format.
+ *
+ * @param samples are unaligned input samples.  The position is in the "middle" of the
+ * sample array with respect to the FIR filter:
+ * the negative half of the filter is dot product from samples+1 to samples+halfNumCoefs;
+ * the positive half of the filter is dot product from samples to samples-halfNumCoefs+1.
+ *
+ * @param volumeLR is a pointer to an array of two 32 bit volume values, one per stereo channel,
+ * expressed as a S32 integer.  A negative value inverts the channel 180 degrees.
+ * The pointer volumeLR should be aligned to a minimum of 8 bytes.
+ * A typical value for volume is 0x1000 to align to a unity gain output of 20.12.
+ *
+ * In between calls to filterCoefficient, the phase is incremented by phaseIncrement, where
+ * phaseIncrement is calculated as inputSampling * phaseWrapLimit / outputSampling.
+ *
+ * The filter polyphase index is given by indexP = phase >> coefShift. Due to
+ * odd length symmetric filter, the polyphase index of the negative half depends on
+ * whether interpolation is used.
+ *
+ * The fractional siting between the polyphase indices is given by the bits below coefShift:
+ *
+ * lerpP = phase << 32 - coefShift >> 1;  // for 32 bit unsigned phase multiply
+ * lerpP = phase << 32 - coefShift >> 17; // for 16 bit unsigned phase multiply
+ *
+ * For integer types, this is expressed as:
+ *
+ * lerpP = phase << sizeof(phase)*8 - coefShift
+ *              >> (sizeof(phase)-sizeof(*coefs))*8 + 1;
+ *
+ */
+
+template<int CHANNELS, bool LOCKED, int STRIDE, typename TC>
+static inline
+void fir(int32_t* const out,
+        const uint32_t phase, const uint32_t phaseWrapLimit,
+        const int coefShift, const int halfNumCoefs, const TC* const coefs,
+        const int16_t* const samples, const int32_t* const volumeLR)
+{
+    // NOTE: be very careful when modifying the code here. register
+    // pressure is very high and a small change might cause the compiler
+    // to generate far less efficient code.
+    // Always sanity check the result with objdump or test-resample.
+
+    if (LOCKED) {
+        // locked polyphase (no interpolation)
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase) >> coefShift;
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const int16_t* sP = samples;
+        const int16_t* sN = samples + CHANNELS;
+
+        // dot product filter.
+        ProcessL<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, sP, sN, volumeLR);
+    } else {
+        // interpolated polyphase
+        // Compute the polyphase filter index on the positive and negative side.
+        uint32_t indexP = phase >> coefShift;
+        uint32_t indexN = (phaseWrapLimit - phase - 1) >> coefShift; // one's complement.
+        const TC* coefsP = coefs + indexP*halfNumCoefs;
+        const TC* coefsN = coefs + indexN*halfNumCoefs;
+        const TC* coefsP1 = coefsP + halfNumCoefs;
+        const TC* coefsN1 = coefsN + halfNumCoefs;
+        const int16_t* sP = samples;
+        const int16_t* sN = samples + CHANNELS;
+
+        // Interpolation fraction lerpP derived by shifting all the way up and down
+        // to clear the appropriate bits and align to the appropriate level
+        // for the integer multiply.  The constants should resolve in compile time.
+        //
+        // The interpolated filter coefficient is derived as follows for the pos/neg half:
+        //
+        // interpolated[P] = index[P]*lerpP + index[P+1]*(1-lerpP)
+        // interpolated[N] = index[N+1]*lerpP + index[N]*(1-lerpP)
+        uint32_t lerpP = phase << (sizeof(phase)*8 - coefShift)
+                >> ((sizeof(phase)-sizeof(*coefs))*8 + 1);
+
+        // on-the-fly interpolated dot product filter
+        Process<CHANNELS, STRIDE>(out,
+                halfNumCoefs, coefsP, coefsN, coefsP1, coefsN1, sP, sN, lerpP, volumeLR);
+    }
+}
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_H*/
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
new file mode 100644
index 0000000..f311cef
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -0,0 +1,1149 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_NEON
+//
+// NEON specializations are enabled for Process() and ProcessL()
+//
+// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
+// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
+// issues with S16 coefs. Consider this later.
+
+// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
+#define ASSEMBLY_ACCUMULATE_MONO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
+        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
+
+#define ASSEMBLY_ACCUMULATE_STEREO \
+        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
+        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
+        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
+        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
+        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
+        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
+        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
+        "vst1.s32       {d3}, %[out]             \n"/* (2+2d)store result*/
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+         ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
+        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive
+
+        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
+        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
+        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
+        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
+        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
+        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #8   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void ProcessL<1, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
+        "subs           %[count], %[count], #8        \n"// update loop counter
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void ProcessL<2, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+        "veor           q4, q4, q4                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 4 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter
+        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<1, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
+        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// accumulate result
+
+        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
+        "subs           %[count], %[count], #8        \n"// update loop counter
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<2, 16>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 16;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
+        "veor           q0, q0, q0                    \n"// result, initialize to 0
+        "veor           q4, q4, q4                    \n"// result, initialize to 0
+
+        "1:                                           \n"
+
+        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
+        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
+        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
+        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
+
+        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
+        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
+        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
+        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
+
+        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
+        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
+
+        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
+        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
+        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
+        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
+
+        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
+        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
+
+        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
+
+        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
+        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
+
+        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
+        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q4, q4, q12                   \n"// accumulate result
+        "vadd.s32       q13, q13, q14                 \n"// accumulate result
+        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
+        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
+
+        "subs           %[count], %[count], #8        \n"// update loop counter
+        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
+
+        "bne            1b                            \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q10"
+    );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q10"
+     );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
+        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
+
+        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
+        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
+
+        // moving these ARM instructions before neon above seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #8        \n"// move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void Process<2, 8>(int32_t* const out,
+        int count,
+        const int16_t* coefsP,
+        const int16_t* coefsN,
+        const int16_t* coefsP1,
+        const int16_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// (1) acc_L = 0
+        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
+        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
+        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 8 16-bits coefs
+        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 8 16-bits coefs for interpolation
+
+        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
+        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coets
+
+        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
+        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
+
+        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
+
+        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
+        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
+
+        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
+        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
+        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
+        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
+
+        // moving these ARM before neon seems to be slower
+        "subs           %[count], %[count], #4   \n"// (1) update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        // sP used after branch (warning)
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN1] "+r" (coefsN1),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q4", "q5", "q6",
+          "q8", "q9", "q10", "q11"
+    );
+}
+
+template <>
+inline void ProcessL<1, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
+
+        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out] "=Uv" (out[0]),
+          [count] "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP] "+r" (sP),
+          [sN] "+r" (sN)
+        : [vLR] "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <>
+inline void ProcessL<2, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int16_t* sP,
+        const int16_t* sN,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+        "veor           q4, q4, q4               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
+
+        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side
+
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsN0] "+r" (coefsN),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+template <>
+inline void Process<1, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 1; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+
+        "1:                                      \n"
+
+        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
+        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_MONO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3",
+          "q8", "q9", "q10", "q11",
+          "q12", "q14"
+    );
+}
+
+template <>
+inline
+void Process<2, 8>(int32_t* const out,
+        int count,
+        const int32_t* coefsP,
+        const int32_t* coefsN,
+        const int32_t* coefsP1,
+        const int32_t* coefsN1,
+        const int16_t* sP,
+        const int16_t* sN,
+        uint32_t lerpP,
+        const int32_t* const volumeLR)
+{
+    const int CHANNELS = 2; // template specialization does not preserve params
+    const int STRIDE = 8;
+    sP -= CHANNELS*((STRIDE>>1)-1);
+    asm (
+        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
+        "veor           q0, q0, q0               \n"// result, initialize to 0
+        "veor           q4, q4, q4               \n"// result, initialize to 0
+
+        "1:                                      \n"
+        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
+        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
+        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
+        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
+        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
+        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
+
+        "vrev64.16      q2, q2                   \n"// (reversed) 2 frames of the positive side
+
+        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
+        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
+        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
+
+        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
+        "vqrdmulh.s32   q11, q11, d2[1]          \n"// interpolate (step3) 2nd set of coefs
+        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
+        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
+
+        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
+        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
+
+        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
+        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
+
+        "vadd.s32       q0, q0, q12              \n"// accumulate result
+        "vadd.s32       q4, q4, q13              \n"// accumulate result
+        "vadd.s32       q0, q0, q14              \n"// accumulate result
+        "vadd.s32       q4, q4, q15              \n"// accumulate result
+
+        "subs           %[count], %[count], #4   \n"// update loop counter
+        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
+
+        "bne            1b                       \n"// loop
+
+        ASSEMBLY_ACCUMULATE_STEREO
+
+        : [out]     "=Uv" (out[0]),
+          [count]   "+r" (count),
+          [coefsP0] "+r" (coefsP),
+          [coefsP1] "+r" (coefsP1),
+          [coefsN0] "+r" (coefsN),
+          [coefsN1] "+r" (coefsN1),
+          [sP]      "+r" (sP),
+          [sN]      "+r" (sN)
+        : [lerpP]   "r" (lerpP),
+          [vLR]     "r" (volumeLR)
+        : "cc", "memory",
+          "q0", "q1", "q2", "q3", "q4",
+          "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15"
+    );
+}
+
+#endif //USE_NEON
+
+}; // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index 403bb6d..3f2ce55 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -33,8 +33,9 @@
 bool gVerbose = false;
 
 static int usage(const char* name) {
-    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq}] [-i input-sample-rate] "
-                   "[-o output-sample-rate] [<input-file>] <output-file>\n", name);
+    fprintf(stderr,"Usage: %s [-p] [-h] [-v] [-s] [-q {dq|lq|mq|hq|vhq|dlq|dmq|dhq}]"
+                   " [-i input-sample-rate] [-o output-sample-rate] [<input-file>]"
+                   " <output-file>\n", name);
     fprintf(stderr,"    -p    enable profiling\n");
     fprintf(stderr,"    -h    create wav file\n");
     fprintf(stderr,"    -v    verbose : log buffer provider calls\n");
@@ -45,6 +46,9 @@
     fprintf(stderr,"              mq  : medium quality\n");
     fprintf(stderr,"              hq  : high quality\n");
     fprintf(stderr,"              vhq : very high quality\n");
+    fprintf(stderr,"              dlq : dynamic low quality\n");
+    fprintf(stderr,"              dmq : dynamic medium quality\n");
+    fprintf(stderr,"              dhq : dynamic high quality\n");
     fprintf(stderr,"    -i    input file sample rate (ignored if input file is specified)\n");
     fprintf(stderr,"    -o    output file sample rate\n");
     return -1;
@@ -86,6 +90,12 @@
                 quality = AudioResampler::HIGH_QUALITY;
             else if (!strcmp(optarg, "vhq"))
                 quality = AudioResampler::VERY_HIGH_QUALITY;
+            else if (!strcmp(optarg, "dlq"))
+                quality = AudioResampler::DYN_LOW_QUALITY;
+            else if (!strcmp(optarg, "dmq"))
+                quality = AudioResampler::DYN_MED_QUALITY;
+            else if (!strcmp(optarg, "dhq"))
+                quality = AudioResampler::DYN_HIGH_QUALITY;
             else {
                 usage(progname);
                 return -1;
@@ -137,6 +147,8 @@
         channels = info.channels;
         input_freq = info.samplerate;
     } else {
+        // data for testing is exactly (input sampling rate/1000)/2 seconds
+        // so 44.1khz input is 22.05 seconds
         double k = 1000; // Hz / s
         double time = (input_freq / 2) / k;
         size_t input_frames = size_t(input_freq * time);
@@ -148,7 +160,7 @@
             double y = sin(M_PI * k * t * t);
             int16_t yi = floor(y * 32767.0 + 0.5);
             for (size_t j=0 ; j<(size_t)channels ; j++) {
-                in[i*channels + j] = yi / (1+j);
+                in[i*channels + j] = yi / (1+j); // right ch. 1/2 left ch.
             }
         }
     }
@@ -170,6 +182,7 @@
         }
         virtual status_t getNextBuffer(Buffer* buffer,
                 int64_t pts = kInvalidPTS) {
+            (void)pts; // suppress warning
             size_t requestedFrames = buffer->frameCount;
             if (requestedFrames > mNumFrames - mNextFrame) {
                 buffer->frameCount = mNumFrames - mNextFrame;
@@ -205,6 +218,9 @@
             buffer->frameCount = 0;
             buffer->i16 = NULL;
         }
+        void reset() {
+            mNextFrame = 0;
+        }
     } provider(input_vaddr, input_size, channels);
 
     size_t input_frames = input_size / (channels * sizeof(int16_t));
@@ -215,37 +231,29 @@
     output_size &= ~7; // always stereo, 32-bits
 
     void* output_vaddr = malloc(output_size);
-
-    if (profiling) {
-        AudioResampler* resampler = AudioResampler::create(16, channels,
-                output_freq, quality);
-
-        size_t out_frames = output_size/8;
-        resampler->setSampleRate(input_freq);
-        resampler->setVolume(0x1000, 0x1000);
-
-        memset(output_vaddr, 0, output_size);
-        timespec start, end;
-        clock_gettime(CLOCK_MONOTONIC, &start);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        resampler->resample((int*) output_vaddr, out_frames, &provider);
-        clock_gettime(CLOCK_MONOTONIC, &end);
-        int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
-        int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
-        int64_t time = (end_ns - start_ns)/4;
-        printf("%f Mspl/s\n", out_frames/(time/1e9)/1e6);
-
-        delete resampler;
-    }
-
     AudioResampler* resampler = AudioResampler::create(16, channels,
             output_freq, quality);
     size_t out_frames = output_size/8;
     resampler->setSampleRate(input_freq);
     resampler->setVolume(0x1000, 0x1000);
 
+    if (profiling) {
+        const int looplimit = 100;
+        timespec start, end;
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->resample((int*) output_vaddr, out_frames, &provider);
+            provider.reset(); //  reset only provider as benchmarking
+        }
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+        int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+        int64_t time = end_ns - start_ns;
+        printf("time(ns):%lld  channels:%d  quality:%d\n", time, channels, quality);
+        printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6);
+        resampler->reset();
+    }
+
     memset(output_vaddr, 0, output_size);
     if (gVerbose) {
         printf("resample() %u output frames\n", out_frames);
@@ -259,14 +267,18 @@
         printf("reset() complete\n");
     }
 
-    // down-mix (we just truncate and keep the left channel)
+    // mono takes left channel only
+    // stereo right channel is half amplitude of stereo left channel (due to input creation)
     int32_t* out = (int32_t*) output_vaddr;
     int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t));
+
     for (size_t i = 0; i < out_frames; i++) {
-        for (int j=0 ; j<channels ; j++) {
+        for (int j = 0; j < channels; j++) {
             int32_t s = out[i * 2 + j] >> 12;
-            if (s > 32767)       s =  32767;
-            else if (s < -32768) s = -32768;
+            if (s > 32767)
+                s = 32767;
+            else if (s < -32768)
+                s = -32768;
             convert[i * channels + j] = int16_t(s);
         }
     }