- Bring the bundled Speex codec up to date with the upstream Speex SVN trunk
- Speex codec should work in FIXED_POINT mode when PJ_HAS_FLOATING_POINT is set to zero.
- ulaw2linear now returns zero when zero is given (this makes the VAD work better, and it also fixes the click noise heard when a call is established or hung up).




git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@628 74dad513-b988-da41-8d7b-12977e46ad98
diff --git a/pjmedia/src/pjmedia-codec/speex/_kiss_fft_guts.h b/pjmedia/src/pjmedia-codec/speex/_kiss_fft_guts.h
index abbd8b1..72acee1 100644
--- a/pjmedia/src/pjmedia-codec/speex/_kiss_fft_guts.h
+++ b/pjmedia/src/pjmedia-codec/speex/_kiss_fft_guts.h
@@ -20,7 +20,6 @@
    and defines
    typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
 #include "kiss_fft.h"
-#include <limits.h>
 
 #define MAXFACTORS 32
 /* e.g. an fft of length 128 has 4 factors 
@@ -45,8 +44,9 @@
    C_ADDTO( res , a)    : res += a
  * */
 #ifdef FIXED_POINT
+#include "misc.h"
 # define FRACBITS 15
-# define SAMPPROD int32_t 
+# define SAMPPROD spx_int32_t 
 #define SAMP_MAX 32767
 
 #define SAMP_MIN -SAMP_MAX
diff --git a/pjmedia/src/pjmedia-codec/speex/arch.h b/pjmedia/src/pjmedia-codec/speex/arch.h
index 5206619..0500437 100644
--- a/pjmedia/src/pjmedia-codec/speex/arch.h
+++ b/pjmedia/src/pjmedia-codec/speex/arch.h
@@ -41,18 +41,12 @@
 #define ABS16(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 16-bit value.  */
 #define MAX16(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 16-bit value.   */
 #define ABS32(x) ((x) < 0 ? (-(x)) : (x))    /**< Absolute 32-bit value.  */
+#define MAX32(a,b) ((a) > (b) ? (a) : (b))   /**< Maximum 32-bit value.   */
 
 #ifdef FIXED_POINT
 
 typedef spx_int16_t spx_word16_t;
 typedef spx_int32_t   spx_word32_t;
-#ifdef _MSC_VER
-typedef __int64      spx_word64_t;
-#elif defined NO_LONGLONG
-typedef double    spx_word64_t;
-#else
-typedef long long    spx_word64_t;
-#endif
 typedef spx_word32_t spx_mem_t;
 typedef spx_word16_t spx_coef_t;
 typedef spx_word16_t spx_lsp_t;
@@ -103,7 +97,6 @@
 typedef float spx_sig_t;
 typedef float spx_word16_t;
 typedef float spx_word32_t;
-typedef float spx_word64_t;
 
 #define Q15ONE 1.0f
 #define LPC_SCALING  1.f
@@ -146,7 +139,6 @@
 #define SUB16(a,b) ((a)-(b))
 #define ADD32(a,b) ((a)+(b))
 #define SUB32(a,b) ((a)-(b))
-#define ADD64(a,b) ((a)+(b))
 #define MULT16_16_16(a,b)     ((a)*(b))
 #define MULT16_16(a,b)     ((spx_word32_t)(a)*(spx_word32_t)(b))
 #define MAC16_16(c,a,b)     ((c)+(spx_word32_t)(a)*(spx_word32_t)(b))
@@ -161,20 +153,25 @@
 
 #define MAC16_16_Q11(c,a,b)     ((c)+(a)*(b))
 #define MAC16_16_Q13(c,a,b)     ((c)+(a)*(b))
+#define MAC16_16_P13(c,a,b)     ((c)+(a)*(b))
 #define MULT16_16_Q11_32(a,b)     ((a)*(b))
 #define MULT16_16_Q13(a,b)     ((a)*(b))
 #define MULT16_16_Q14(a,b)     ((a)*(b))
 #define MULT16_16_Q15(a,b)     ((a)*(b))
 #define MULT16_16_P15(a,b)     ((a)*(b))
+#define MULT16_16_P13(a,b)     ((a)*(b))
+#define MULT16_16_P14(a,b)     ((a)*(b))
 
-#define DIV32_16(a,b)     ((a)/(b))
-#define DIV32(a,b)     ((a)/(b))
+#define DIV32_16(a,b)     (((spx_word32_t)(a))/(spx_word16_t)(b))
+#define PDIV32_16(a,b)     (((spx_word32_t)(a))/(spx_word16_t)(b))
+#define DIV32(a,b)     (((spx_word32_t)(a))/(spx_word32_t)(b))
+#define PDIV32(a,b)     (((spx_word32_t)(a))/(spx_word32_t)(b))
 
 
 #endif
 
 
-#ifdef CONFIG_TI_C55X
+#if defined (CONFIG_TI_C54X) || defined (CONFIG_TI_C55X)
 
 /* 2 on TI C5x DSP */
 #define BYTES_PER_CHAR 2 
diff --git a/pjmedia/src/pjmedia-codec/speex/bits.c b/pjmedia/src/pjmedia-codec/speex/bits.c
index fae7a9e..376e804 100644
--- a/pjmedia/src/pjmedia-codec/speex/bits.c
+++ b/pjmedia/src/pjmedia-codec/speex/bits.c
@@ -93,28 +93,36 @@
 void speex_bits_read_from(SpeexBits *bits, char *chars, int len)
 {
    int i;
-   if (len > bits->buf_size)
+   int nchars = len / BYTES_PER_CHAR;
+   if (nchars > bits->buf_size)
    {
       speex_warning_int("Packet is larger than allocated buffer: ", len);
       if (bits->owner)
       {
-         char *tmp = (char*)speex_realloc(bits->chars, len);
+         char *tmp = (char*)speex_realloc(bits->chars, nchars);
          if (tmp)
          {
-            bits->buf_size=len;
+            bits->buf_size=nchars;
             bits->chars=tmp;
          } else {
-            len=bits->buf_size;
+            nchars=bits->buf_size;
             speex_warning("Could not resize input buffer: truncating input");
          }
       } else {
          speex_warning("Do not own input buffer: truncating input");
-         len=bits->buf_size;
+         nchars=bits->buf_size;
       }
    }
-   for (i=0;i<len;i++)
-      bits->chars[i]=chars[i];
-   bits->nbBits=len<<3;
+#if (BYTES_PER_CHAR==2)
+/* Swap bytes to proper endian order (could be done externally) */
+#define HTOLS(A) ((((A) >> 8)&0xff)|(((A) & 0xff)<<8))
+#else
+#define HTOLS(A) (A)
+#endif
+   for (i=0;i<nchars;i++)
+      bits->chars[i]=HTOLS(chars[i]);
+
+   bits->nbBits=nchars<<LOG2_BITS_PER_CHAR;
    bits->charPtr=0;
    bits->bitPtr=0;
    bits->overflow=0;
@@ -161,7 +169,7 @@
    speex_bits_flush(bits);
    pos=bits->nbBits>>LOG2_BITS_PER_CHAR;
    for (i=0;i<nchars;i++)
-      bits->chars[pos+i]=chars[i];
+      bits->chars[pos+i]=HTOLS(chars[i]);
    bits->nbBits+=nchars<<LOG2_BITS_PER_CHAR;
 }
 
@@ -182,11 +190,7 @@
 
    if (max_nchars > ((bits->nbBits+BITS_PER_CHAR-1)>>LOG2_BITS_PER_CHAR))
       max_nchars = ((bits->nbBits+BITS_PER_CHAR-1)>>LOG2_BITS_PER_CHAR);
-#if BYTES_PER_CHAR==1
-#define HTOLS(A) (A)
-#else
-#define HTOLS(A) ((((A) >> 8)&0xff)|(((A) & 0xff)<<8))
-#endif
+
    for (i=0;i<max_nchars;i++)
       chars[i]=HTOLS(bits->chars[i]);
    return max_nchars*BYTES_PER_CHAR;
@@ -199,8 +203,8 @@
    if (max_nchars > ((bits->nbBits)>>LOG2_BITS_PER_CHAR))
       max_nchars = ((bits->nbBits)>>LOG2_BITS_PER_CHAR);
    for (i=0;i<max_nchars;i++)
-      chars[i]=bits->chars[i];
-   
+      chars[i]=HTOLS(bits->chars[i]);
+
    if (bits->bitPtr>0)
       bits->chars[0]=bits->chars[max_nchars];
    else
diff --git a/pjmedia/src/pjmedia-codec/speex/cb_search.c b/pjmedia/src/pjmedia-codec/speex/cb_search.c
index 234dc2e..b4a223b 100644
--- a/pjmedia/src/pjmedia-codec/speex/cb_search.c
+++ b/pjmedia/src/pjmedia-codec/speex/cb_search.c
@@ -70,7 +70,7 @@
          for (k=0;k<=j;k++)
             resj = MAC16_16(resj,shape[k],r[j-k]);
 #ifdef FIXED_POINT
-         res16 = EXTRACT16(SHR32(resj, 11));
+         res16 = EXTRACT16(SHR32(resj, 13));
 #else
          res16 = 0.03125f*resj;
 #endif
@@ -88,16 +88,15 @@
 static inline void target_update(spx_word16_t *t, spx_word16_t g, spx_word16_t *r, int len)
 {
    int n;
-   int q=0;
-   for (n=0;n<len;n++,q++)
-      t[n] = SUB32(t[n],MULT16_16_Q11_32(g,r[q]));
+   for (n=0;n<len;n++)
+      t[n] = SUB16(t[n],PSHR32(MULT16_16(g,r[n]),13));
 }
 #endif
 
 
 
 static void split_cb_search_shape_sign_N1(
-spx_sig_t target[],			/* target vector */
+spx_word16_t target[],			/* target vector */
 spx_coef_t ak[],			/* LPCs for this subframe */
 spx_coef_t awk1[],			/* Weighted LPCs for this subframe */
 spx_coef_t awk2[],			/* Weighted LPCs for this subframe */
@@ -113,9 +112,6 @@
 )
 {
    int i,j,m,q;
-#ifndef FIXED_POINT
-   int n;
-#endif
    VARDECL(spx_word16_t *resp);
 #ifdef _USE_SSE
    VARDECL(__m128 *resp2);
@@ -158,7 +154,7 @@
    
    /* FIXME: make that adaptive? */
    for (i=0;i<nsf;i++)
-      t[i]=EXTRACT16(PSHR32(target[i],6));
+      t[i]=target[i];
 
    compute_weighted_codebook(shape_cb, r, resp, resp2, E, shape_cb_size, subvect_size, stack);
 
@@ -222,13 +218,10 @@
          q=subvect_size-m;
 #ifdef FIXED_POINT
          g=sign*shape_cb[rind*subvect_size+m];
-         target_update(t+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
 #else
          g=sign*0.03125*shape_cb[rind*subvect_size+m];
-         /*FIXME: I think that one too can be replaced by target_update */
-         for (n=subvect_size*(i+1);n<nsf;n++,q++)
-            t[n] = SUB32(t[n],g*r[q]);
 #endif
+         target_update(t+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
       }
    }
 
@@ -244,14 +237,14 @@
       ALLOC(r2, nsf, spx_sig_t);
       syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
       for (j=0;j<nsf;j++)
-         target[j]=SUB32(target[j],r2[j]);
+         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
    }
 }
 
 
 
 void split_cb_search_shape_sign(
-spx_sig_t target[],			/* target vector */
+spx_word16_t target[],			/* target vector */
 spx_coef_t ak[],			/* LPCs for this subframe */
 spx_coef_t awk1[],			/* Weighted LPCs for this subframe */
 spx_coef_t awk2[],			/* Weighted LPCs for this subframe */
@@ -356,7 +349,7 @@
    
    /* FIXME: make that adaptive? */
    for (i=0;i<nsf;i++)
-      t[i]=EXTRACT16(PSHR32(target[i],6));
+      t[i]=target[i];
 
    for (j=0;j<N;j++)
       speex_move(&ot[j][0], t, nsf*sizeof(spx_word16_t));
@@ -444,13 +437,10 @@
             q=subvect_size-m;
 #ifdef FIXED_POINT
             g=sign*shape_cb[rind*subvect_size+m];
-            target_update(nt[j]+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
 #else
             g=sign*0.03125*shape_cb[rind*subvect_size+m];
-            /*FIXME: I think that one too can be replaced by target_update */
-            for (n=subvect_size*(i+1);n<nsf;n++,q++)
-               nt[j][n] = SUB32(nt[j][n],g*r[q]);
 #endif
+            target_update(nt[j]+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
          }
 
          for (q=0;q<nb_subvect;q++)
@@ -514,7 +504,7 @@
    {
       syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
       for (j=0;j<nsf;j++)
-         target[j]=SUB32(target[j],r2[j]);
+         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
    }
 }
 
@@ -577,7 +567,7 @@
 }
 
 void noise_codebook_quant(
-spx_sig_t target[],			/* target vector */
+spx_word16_t target[],			/* target vector */
 spx_coef_t ak[],			/* LPCs for this subframe */
 spx_coef_t awk1[],			/* Weighted LPCs for this subframe */
 spx_coef_t awk2[],			/* Weighted LPCs for this subframe */
@@ -595,13 +585,14 @@
    int i;
    VARDECL(spx_sig_t *tmp);
    ALLOC(tmp, nsf, spx_sig_t);
-   residue_percep_zero(target, ak, awk1, awk2, tmp, nsf, p, stack);
+   for (i=0;i<nsf;i++)
+      tmp[i]=PSHR32(EXTEND32(target[i]),SIG_SHIFT);
+   residue_percep_zero(tmp, ak, awk1, awk2, tmp, nsf, p, stack);
 
    for (i=0;i<nsf;i++)
       exc[i]+=tmp[i];
    for (i=0;i<nsf;i++)
       target[i]=0;
-
 }
 
 
@@ -613,5 +604,9 @@
 char *stack
 )
 {
-   speex_rand_vec(1, exc, nsf);
+   int i;
+   /* FIXME: This is bad, but I don't think the function ever gets called anyway */
+   spx_int32_t seed = 0;
+   for (i=0;i<nsf;i++)
+      exc[i]=SHL32(EXTEND32(speex_rand(1, &seed)),SIG_SHIFT);
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/cb_search.h b/pjmedia/src/pjmedia-codec/speex/cb_search.h
index 38ac077..ea8816d 100644
--- a/pjmedia/src/pjmedia-codec/speex/cb_search.h
+++ b/pjmedia/src/pjmedia-codec/speex/cb_search.h
@@ -49,7 +49,7 @@
 
 
 void split_cb_search_shape_sign(
-spx_sig_t target[],             /* target vector */
+spx_word16_t target[],             /* target vector */
 spx_coef_t ak[],                /* LPCs for this subframe */
 spx_coef_t awk1[],              /* Weighted LPCs for this subframe */
 spx_coef_t awk2[],              /* Weighted LPCs for this subframe */
@@ -74,7 +74,7 @@
 
 
 void noise_codebook_quant(
-spx_sig_t target[],             /* target vector */
+spx_word16_t target[],             /* target vector */
 spx_coef_t ak[],                /* LPCs for this subframe */
 spx_coef_t awk1[],              /* Weighted LPCs for this subframe */
 spx_coef_t awk2[],              /* Weighted LPCs for this subframe */
diff --git a/pjmedia/src/pjmedia-codec/speex/cb_search_bfin.h b/pjmedia/src/pjmedia-codec/speex/cb_search_bfin.h
index 7d476a3..52cc4b3 100644
--- a/pjmedia/src/pjmedia-codec/speex/cb_search_bfin.h
+++ b/pjmedia/src/pjmedia-codec/speex/cb_search_bfin.h
@@ -61,7 +61,7 @@
                "A0 += R0.L*R1.L (IS) || R0 = B[P4++] (X) || R1.L = W[I1--];\n\t"
             "LOOP_END inner%=;\n\t"
             "R0 = A0;\n\t"
-            "R0 >>>= 11;\n\t"
+            "R0 >>>= 13;\n\t"
             "A1 += R0.L*R0.L (IS);\n\t"
             "W[P3++] = R0;\n\t"
             "P0 += 1;\n\t"
@@ -72,7 +72,8 @@
          "[P4] = R1;\n\t"
          :
       : "m" (subvect_size), "m" (shape_cb), "m" (r), "m" (resp), "m" (E)
-      : "A0", "P0", "P1", "P2", "P3", "P4", "R0", "R1", "R2", "I0", "I1", "L0", "L1", "A0", "A1", "memory"
+      : "A0", "P0", "P1", "P2", "P3", "P4", "R0", "R1", "R2", "I0", "I1", "L0", 
+        "L1", "A0", "A1", "memory", "LC0", "LC1"
       );
       shape_cb += subvect_size;
       resp += subvect_size;
@@ -83,23 +84,26 @@
 #define OVERRIDE_TARGET_UPDATE
 static inline void target_update(spx_word16_t *t, spx_word16_t g, spx_word16_t *r, int len)
 {
+   if (!len)
+      return;
    __asm__ __volatile__
          (
          "I0 = %0;\n\t"
          "I1 = %1;\n\t"
          "L0 = 0;\n\t"
          "L1 = 0;\n\t"
+         "R2 = 4096;\n\t"
          "LOOP tupdate%= LC0 = %3;\n\t"
          "LOOP_BEGIN tupdate%=;\n\t"
             "R0.L = W[I0] || R1.L = W[I1++];\n\t"
             "R1 = (A1 = R1.L*%2.L) (IS);\n\t"
-            "R1 >>>= 11;\n\t"
+            "R1 = R1 + R2;\n\t"
+            "R1 >>>= 13;\n\t"
             "R0.L = R0.L - R1.L;\n\t"
             "W[I0++] = R0.L;\n\t"
          "LOOP_END tupdate%=;\n\t"
    :
    : "a" (t), "a" (r), "d" (g), "a" (len)
-   : "R0", "R1", "A1", "I0", "I1", "L0", "L1"
+   : "R0", "R1", "R2", "A1", "I0", "I1", "L0", "L1"
          );
 }
-
diff --git a/pjmedia/src/pjmedia-codec/speex/config.h b/pjmedia/src/pjmedia-codec/speex/config.h
index ef3a4a0..6ab2235 100644
--- a/pjmedia/src/pjmedia-codec/speex/config.h
+++ b/pjmedia/src/pjmedia-codec/speex/config.h
@@ -1,15 +1,15 @@
 
-#include <pj/config.h>
+/* Check if we need to use the fixed point version */
+#if !defined(PJ_HAS_FLOATING_POINT) || PJ_HAS_FLOATING_POINT==0
+#   define FIXED_POINT
+#endif
+
 
 #define inline __inline
 #define restrict
 
 #include "misc.h"
 
-#if !defined(PJ_HAS_FLOATING_POINT) || PJ_HAS_FLOATING_POINT==0
-#   define FIXED_POINT
-#endif
-
 #ifdef _MSC_VER
 #   pragma warning(disable: 4100)   // unreferenced formal parameter
 #   pragma warning(disable: 4101)   // unreferenced local variable
diff --git a/pjmedia/src/pjmedia-codec/speex/fftwrap.c b/pjmedia/src/pjmedia-codec/speex/fftwrap.c
index de09356..79a1de3 100644
--- a/pjmedia/src/pjmedia-codec/speex/fftwrap.c
+++ b/pjmedia/src/pjmedia-codec/speex/fftwrap.c
@@ -42,6 +42,7 @@
 
 #include "misc.h"
 
+#define MAX_FFT_SIZE 2048
 
 #ifdef FIXED_POINT
 static int maximize_range(spx_word16_t *in, spx_word16_t *out, spx_word16_t bound, int len)
@@ -225,9 +226,8 @@
 #endif
 
 
-int fixed_point = 1;
 #ifdef FIXED_POINT
-#include "smallft.h"
+/*#include "smallft.h"*/
 
 
 void spx_fft_float(void *table, float *in, float *out)
@@ -239,13 +239,19 @@
    int N = ((struct kiss_config *)table)->N;
 #else
 #endif
+#ifdef VAR_ARRAYS
    spx_word16_t _in[N];
    spx_word16_t _out[N];
+#else
+   spx_word16_t _in[MAX_FFT_SIZE];
+   spx_word16_t _out[MAX_FFT_SIZE];
+#endif
    for (i=0;i<N;i++)
       _in[i] = (int)floor(.5+in[i]);
    spx_fft(table, _in, _out);
    for (i=0;i<N;i++)
       out[i] = _out[i];
+#if 0
    if (!fixed_point)
    {
       float scale;
@@ -257,6 +263,7 @@
       spx_drft_forward(&t, out);
       spx_drft_clear(&t);
    }
+#endif
 }
 
 void spx_ifft_float(void *table, float *in, float *out)
@@ -268,13 +275,19 @@
    int N = ((struct kiss_config *)table)->N;
 #else
 #endif
+#ifdef VAR_ARRAYS
    spx_word16_t _in[N];
    spx_word16_t _out[N];
+#else
+   spx_word16_t _in[MAX_FFT_SIZE];
+   spx_word16_t _out[MAX_FFT_SIZE];
+#endif
    for (i=0;i<N;i++)
       _in[i] = (int)floor(.5+in[i]);
    spx_ifft(table, _in, _out);
    for (i=0;i<N;i++)
       out[i] = _out[i];
+#if 0
    if (!fixed_point)
    {
       int i;
@@ -285,6 +298,7 @@
       spx_drft_backward(&t, out);
       spx_drft_clear(&t);
    }
+#endif
 }
 
 #else
diff --git a/pjmedia/src/pjmedia-codec/speex/filters.c b/pjmedia/src/pjmedia-codec/speex/filters.c
index abc8d9c..73cb391 100644
--- a/pjmedia/src/pjmedia-codec/speex/filters.c
+++ b/pjmedia/src/pjmedia-codec/speex/filters.c
@@ -75,25 +75,35 @@
    }
 }
 
-void signal_div(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
+void signal_div(const spx_word16_t *x, spx_word16_t *y, spx_word32_t scale, int len)
 {
    int i;
    if (scale > SHL32(EXTEND32(SIG_SCALING), 8))
    {
       spx_word16_t scale_1;
       scale = PSHR32(scale, SIG_SHIFT);
-      scale_1 = EXTRACT16(DIV32_16(SHL32(EXTEND32(SIG_SCALING),7),scale));
+      scale_1 = EXTRACT16(PDIV32_16(SHL32(EXTEND32(SIG_SCALING),7),scale));
       for (i=0;i<len;i++)
       {
-         y[i] = SHR32(MULT16_16(scale_1, EXTRACT16(SHR32(x[i],SIG_SHIFT))),7);
+         y[i] = MULT16_16_P15(scale_1, x[i]);
       }
-   } else {
+   } else if (scale > SHR32(EXTEND32(SIG_SCALING), 2)) {
       spx_word16_t scale_1;
       scale = PSHR32(scale, SIG_SHIFT-5);
       scale_1 = DIV32_16(SHL32(EXTEND32(SIG_SCALING),3),scale);
       for (i=0;i<len;i++)
       {
-         y[i] = MULT16_16(scale_1, EXTRACT16(SHR32(x[i],SIG_SHIFT-2)));
+         y[i] = PSHR32(MULT16_16(scale_1, SHL16(x[i],2)),8);
+      }
+   } else {
+      spx_word16_t scale_1;
+      scale = PSHR32(scale, SIG_SHIFT-7);
+      if (scale < 5)
+         scale = 5;
+      scale_1 = DIV32_16(SHL32(EXTEND32(SIG_SCALING),3),scale);
+      for (i=0;i<len;i++)
+      {
+         y[i] = PSHR32(MULT16_16(scale_1, SHL16(x[i],2)),6);
       }
    }
 }
@@ -160,9 +170,56 @@
       sum = ADD32(sum,SHR32(sum2,6));
    }
    
-   return EXTRACT16(SHR32(SHL32(EXTEND32(spx_sqrt(1+DIV32(sum,len))),(sig_shift+3)),SIG_SHIFT));
+   return EXTRACT16(PSHR32(SHL32(EXTEND32(spx_sqrt(DIV32(sum,len))),(sig_shift+3)),SIG_SHIFT));
 }
 
+spx_word16_t compute_rms16(const spx_word16_t *x, int len)
+{
+   int i;
+   spx_word16_t max_val=10; 
+
+   for (i=0;i<len;i++)
+   {
+      spx_sig_t tmp = x[i];
+      if (tmp<0)
+         tmp = -tmp;
+      if (tmp > max_val)
+         max_val = tmp;
+   }
+   if (max_val>16383)
+   {
+      spx_word32_t sum=0;
+      for (i=0;i<len;i+=4)
+      {
+         spx_word32_t sum2=0;
+         sum2 = MAC16_16(sum2,PSHR16(x[i],1),PSHR16(x[i],1));
+         sum2 = MAC16_16(sum2,PSHR16(x[i+1],1),PSHR16(x[i+1],1));
+         sum2 = MAC16_16(sum2,PSHR16(x[i+2],1),PSHR16(x[i+2],1));
+         sum2 = MAC16_16(sum2,PSHR16(x[i+3],1),PSHR16(x[i+3],1));
+         sum = ADD32(sum,SHR32(sum2,6));
+      }
+      return SHL16(spx_sqrt(DIV32(sum,len)),4);
+   } else {
+      spx_word32_t sum=0;
+      int sig_shift=0;
+      if (max_val < 8192)
+         sig_shift=1;
+      if (max_val < 4096)
+         sig_shift=2;
+      if (max_val < 2048)
+         sig_shift=3;
+      for (i=0;i<len;i+=4)
+      {
+         spx_word32_t sum2=0;
+         sum2 = MAC16_16(sum2,SHL16(x[i],sig_shift),SHL16(x[i],sig_shift));
+         sum2 = MAC16_16(sum2,SHL16(x[i+1],sig_shift),SHL16(x[i+1],sig_shift));
+         sum2 = MAC16_16(sum2,SHL16(x[i+2],sig_shift),SHL16(x[i+2],sig_shift));
+         sum2 = MAC16_16(sum2,SHL16(x[i+3],sig_shift),SHL16(x[i+3],sig_shift));
+         sum = ADD32(sum,SHR32(sum2,6));
+      }
+      return SHL16(spx_sqrt(DIV32(sum,len)),3-sig_shift);   
+   }
+}
 
 #ifndef OVERRIDE_NORMALIZE16
 int normalize16(const spx_sig_t *x, spx_word16_t *y, spx_sig_t max_scale, int len)
@@ -206,6 +263,10 @@
    }
    return sqrt(.1+sum/len);
 }
+spx_word16_t compute_rms16(const spx_word16_t *x, int len)
+{
+   return compute_rms(x, len);
+}
 #endif
 
 
@@ -236,6 +297,8 @@
    int i,j;
    spx_sig_t xi,yi,nyi;
 
+   for (i=0;i<ord;i++)
+      mem[i] = SHR32(mem[i],1);   
    for (i=0;i<N;i++)
    {
       xi=SATURATE(x[i],805306368);
@@ -248,10 +311,40 @@
       mem[ord-1] = SUB32(MULT16_32_Q15(num[ord-1],xi), MULT16_32_Q15(den[ord-1],yi));
       y[i] = yi;
    }
+   for (i=0;i<ord;i++)
+      mem[i] = SHL32(mem[i],1);   
 }
 #endif
 #endif
 
+#ifdef FIXED_POINT
+#ifndef OVERRIDE_FILTER_MEM16
+void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   int i,j;
+   spx_word16_t xi,yi,nyi;
+   for (i=0;i<N;i++)
+   {
+      xi= x[i];
+      yi = EXTRACT16(SATURATE(ADD32(EXTEND32(x[i]),PSHR32(mem[0],LPC_SHIFT)),32767));
+      nyi = NEG16(yi);
+      for (j=0;j<ord-1;j++)
+      {
+         mem[j] = MAC16_16(MAC16_16(mem[j+1], num[j],xi), den[j],nyi);
+      }
+      mem[ord-1] = ADD32(MULT16_16(num[ord-1],xi), MULT16_16(den[ord-1],nyi));
+      y[i] = yi;
+   }
+}
+#endif
+#else
+void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   filter_mem2(x, num, den, y, N, ord, mem);
+}
+#endif
+
+
 #ifndef OVERRIDE_IIR_MEM2
 #ifdef PRECISION16
 void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
@@ -277,6 +370,8 @@
    int i,j;
    spx_word32_t xi,yi,nyi;
 
+   for (i=0;i<ord;i++)
+      mem[i] = SHR32(mem[i],1);   
    for (i=0;i<N;i++)
    {
       xi=SATURATE(x[i],805306368);
@@ -289,10 +384,40 @@
       mem[ord-1] = MULT16_32_Q15(den[ord-1],nyi);
       y[i] = yi;
    }
+   for (i=0;i<ord;i++)
+      mem[i] = SHL32(mem[i],1);   
 }
 #endif
 #endif
 
+#ifdef FIXED_POINT
+#ifndef OVERRIDE_IIR_MEM16
+void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   int i,j;
+   spx_word16_t yi,nyi;
+
+   for (i=0;i<N;i++)
+   {
+      yi = EXTRACT16(SATURATE(ADD32(EXTEND32(x[i]),PSHR32(mem[0],LPC_SHIFT)),32767));
+      nyi = NEG16(yi);
+      for (j=0;j<ord-1;j++)
+      {
+         mem[j] = MAC16_16(mem[j+1],den[j],nyi);
+      }
+      mem[ord-1] = MULT16_16(den[ord-1],nyi);
+      y[i] = yi;
+   }
+}
+#endif
+#else
+void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   iir_mem2(x, den, y, N, ord, mem);
+}
+#endif
+
+
 #ifndef OVERRIDE_FIR_MEM2
 #ifdef PRECISION16
 void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
@@ -318,6 +443,8 @@
    int i,j;
    spx_word32_t xi,yi;
 
+   for (i=0;i<ord;i++)
+      mem[i] = SHR32(mem[i],1);   
    for (i=0;i<N;i++)
    {
       xi=SATURATE(x[i],805306368);
@@ -329,11 +456,38 @@
       mem[ord-1] = MULT16_32_Q15(num[ord-1],xi);
       y[i] = SATURATE(yi,805306368);
    }
+   for (i=0;i<ord;i++)
+      mem[i] = SHL32(mem[i],1);   
 }
 #endif
 #endif
 
+#ifdef FIXED_POINT
+#ifndef OVERRIDE_FIR_MEM16
+void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   int i,j;
+   spx_word16_t xi,yi;
 
+   for (i=0;i<N;i++)
+   {
+      xi=x[i];
+      yi = EXTRACT16(SATURATE(ADD32(EXTEND32(x[i]),PSHR32(mem[0],LPC_SHIFT)),32767));
+      for (j=0;j<ord-1;j++)
+      {
+         mem[j] = MAC16_16(mem[j+1], num[j],xi);
+      }
+      mem[ord-1] = MULT16_16(num[ord-1],xi);
+      y[i] = yi;
+   }
+}
+#endif
+#else
+void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack)
+{
+   fir_mem2(x, num, y, N, ord, mem);
+}
+#endif
 
 
 
@@ -382,14 +536,13 @@
    i++;
    for (;i<N;i++)
       y[i] = VERY_SMALL;
-   
    for (i=0;i<ord;i++)
       mem1[i] = mem2[i] = 0;
    for (i=0;i<N;i++)
    {
       y1 = ADD16(y[i], EXTRACT16(PSHR32(mem1[0],LPC_SHIFT)));
       ny1i = NEG16(y1);
-      y[i] = ADD16(SHL16(y1,1), EXTRACT16(PSHR32(mem2[0],LPC_SHIFT)));
+      y[i] = PSHR32(ADD32(SHL32(EXTEND32(y1),LPC_SHIFT+1),mem2[0]),LPC_SHIFT);
       ny2i = NEG16(y[i]);
       for (j=0;j<ord-1;j++)
       {
@@ -426,12 +579,14 @@
       y2[k]=0;
       for (j=0;j<M2;j++)
       {
-         y1[k]=ADD32(y1[k],SHR(MULT16_16(a[j],ADD16(x[i+j],x2[i-j])),1));
-         y2[k]=SUB32(y2[k],SHR(MULT16_16(a[j],SUB16(x[i+j],x2[i-j])),1));
+         y1[k]=ADD32(y1[k],MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
+         y2[k]=SUB32(y2[k],MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
          j++;
-         y1[k]=ADD32(y1[k],SHR(MULT16_16(a[j],ADD16(x[i+j],x2[i-j])),1));
-         y2[k]=ADD32(y2[k],SHR(MULT16_16(a[j],SUB16(x[i+j],x2[i-j])),1));
+         y1[k]=ADD32(y1[k],MULT16_16(a[j],ADD16(x[i+j],x2[i-j])));
+         y2[k]=ADD32(y2[k],MULT16_16(a[j],SUB16(x[i+j],x2[i-j])));
       }
+      y1[k] = SHR32(y1[k],1);
+      y2[k] = SHR32(y2[k],1);
    }
    for (i=0;i<M-1;i++)
      mem[i]=SATURATE(PSHR(xx[N-i-1],1),16383);
@@ -450,7 +605,7 @@
    ALLOC(xx, M+N-1, spx_word16_t);
 
    for (i = 0; i < N/2; i++)
-      xx[2*i] = SHR(x[N/2-1-i],SIG_SHIFT+1);
+      xx[2*i] = PSHR32(x[N/2-1-i],SIG_SHIFT);
    for (i = 0; i < M - 1; i += 2)
       xx[N+i] = mem[i+1];
 
@@ -469,19 +624,19 @@
          a1 = a[j+1];
          x1 = xx[N-2+j-i];
 
-         y0 = ADD32(y0,SHR(MULT16_16(a0, x1),1));
-         y1 = ADD32(y1,SHR(MULT16_16(a1, x1),1));
-         y2 = ADD32(y2,SHR(MULT16_16(a0, x0),1));
-         y3 = ADD32(y3,SHR(MULT16_16(a1, x0),1));
+         y0 = ADD32(y0,SHR(MULT16_16(a0, x1),2));
+         y1 = ADD32(y1,SHR(MULT16_16(a1, x1),2));
+         y2 = ADD32(y2,SHR(MULT16_16(a0, x0),2));
+         y3 = ADD32(y3,SHR(MULT16_16(a1, x0),2));
 
          a0 = a[j+2];
          a1 = a[j+3];
          x0 = xx[N+j-i];
 
-         y0 = ADD32(y0,SHR(MULT16_16(a0, x0),1));
-         y1 = ADD32(y1,SHR(MULT16_16(a1, x0),1));
-         y2 = ADD32(y2,SHR(MULT16_16(a0, x1),1));
-         y3 = ADD32(y3,SHR(MULT16_16(a1, x1),1));
+         y0 = ADD32(y0,SHR(MULT16_16(a0, x0),2));
+         y1 = ADD32(y1,SHR(MULT16_16(a1, x0),2));
+         y2 = ADD32(y2,SHR(MULT16_16(a0, x1),2));
+         y3 = ADD32(y3,SHR(MULT16_16(a1, x1),2));
       }
       y[i] = y0;
       y[i+1] = y1;
@@ -493,113 +648,229 @@
       mem[i+1] = xx[i];
 }
 
-void comb_filter_mem_init (CombFilterMem *mem)
-{
-   mem->last_pitch=0;
-   mem->last_pitch_gain[0]=mem->last_pitch_gain[1]=mem->last_pitch_gain[2]=0;
-   mem->smooth_gain=1;
-}
-
 #ifdef FIXED_POINT
-#define COMB_STEP 32767
+#if 0
+spx_word16_t shift_filt[3][7] = {{-33,    1043,   -4551,   19959,   19959,   -4551,    1043},
+                                 {-98,    1133,   -4425,   29179,    8895,   -2328,     444},
+                                 {444,   -2328,    8895,   29179,   -4425,    1133,     -98}};
 #else
-#define COMB_STEP 1.0
+spx_word16_t shift_filt[3][7] = {{-390,    1540,   -4993,   20123,   20123,   -4993,    1540},
+                                {-1064,    2817,   -6694,   31589,    6837,    -990,    -209},
+                                 {-209,    -990,    6837,   31589,   -6694,    2817,   -1064}};
+#endif
+#else
+#if 0
+float shift_filt[3][7] = {{-9.9369e-04, 3.1831e-02, -1.3889e-01, 6.0910e-01, 6.0910e-01, -1.3889e-01, 3.1831e-02},
+                          {-0.0029937, 0.0345613, -0.1350474, 0.8904793, 0.2714479, -0.0710304, 0.0135403},
+                          {0.0135403, -0.0710304, 0.2714479, 0.8904793, -0.1350474, 0.0345613,  -0.0029937}};
+#else
+float shift_filt[3][7] = {{-0.011915, 0.046995, -0.152373, 0.614108, 0.614108, -0.152373, 0.046995},
+                          {-0.0324855, 0.0859768, -0.2042986, 0.9640297, 0.2086420, -0.0302054, -0.0063646},
+                          {-0.0063646, -0.0302054, 0.2086420, 0.9640297, -0.2042986, 0.0859768, -0.0324855}};
+#endif
 #endif
 
-void comb_filter(
-spx_sig_t *exc,          /*decoded excitation*/
-spx_sig_t *new_exc,      /*enhanced excitation*/
+int interp_pitch(
+spx_word16_t *exc,          /*decoded excitation*/
+spx_word16_t *interp,          /*output: excitation interpolated at the selected pitch lag*/
+int pitch,               /*pitch period*/
+int len
+)
+{
+   int i,j,k;
+   spx_word32_t corr[4][7];
+   spx_word32_t maxcorr;
+   int maxi, maxj;
+   for (i=0;i<7;i++)
+   {
+      corr[0][i] = inner_prod(exc, exc-pitch-3+i, len);
+   }
+   for (i=0;i<3;i++)
+   {
+      for (j=0;j<7;j++)
+      {
+         int i1, i2;
+         spx_word32_t tmp=0;
+         i1 = 3-j;
+         if (i1<0)
+            i1 = 0;
+         i2 = 10-j;
+         if (i2>7)
+            i2 = 7;
+         for (k=i1;k<i2;k++)
+            tmp += MULT16_32_Q15(shift_filt[i][k],corr[0][j+k-3]);
+         corr[i+1][j] = tmp;
+      }
+   }
+   maxi=maxj=0;
+   maxcorr = corr[0][0];
+   for (i=0;i<4;i++)
+   {
+      for (j=0;j<7;j++)
+      {
+         if (corr[i][j] > maxcorr)
+         {
+            maxcorr = corr[i][j];
+            maxi=i;
+            maxj=j;
+         }
+      }
+   }
+   for (i=0;i<len;i++)
+   {
+      spx_word32_t tmp = 0;
+      if (maxi>0)
+      {
+         for (k=0;k<7;k++)
+         {
+            tmp += MULT16_16(exc[i-(pitch-maxj+3)+k-3],shift_filt[maxi-1][k]);
+         }
+      } else {
+         tmp = SHL32(exc[i-(pitch-maxj+3)],15);
+      }
+      interp[i] = PSHR32(tmp,15);
+   }
+   return pitch-maxj+3;
+}
+
+void multicomb(
+spx_word16_t *exc,          /*decoded excitation*/
+spx_word16_t *new_exc,      /*enhanced excitation*/
 spx_coef_t *ak,           /*LPC filter coefs*/
 int p,               /*LPC order*/
 int nsf,             /*sub-frame size*/
 int pitch,           /*pitch period*/
-spx_word16_t *pitch_gain,   /*pitch gain (3-tap)*/
+int max_pitch,
 spx_word16_t  comb_gain,    /*gain of comb filter*/
-CombFilterMem *mem
+char *stack
 )
 {
-   int i;
-   spx_word16_t exc_energy=0, new_exc_energy=0;
-   spx_word16_t gain;
-   spx_word16_t step;
-   spx_word16_t fact;
-
-   /*Compute excitation amplitude prior to enhancement*/
-   exc_energy = compute_rms(exc, nsf);
-   /*for (i=0;i<nsf;i++)
-     exc_energy+=((float)exc[i])*exc[i];*/
-
-   /*Some gain adjustment if pitch is too high or if unvoiced*/
-#ifdef FIXED_POINT
-   {
-      spx_word16_t g = gain_3tap_to_1tap(pitch_gain)+gain_3tap_to_1tap(mem->last_pitch_gain);
-      if (g > 166)
-         comb_gain = MULT16_16_Q15(DIV32_16(SHL32(EXTEND32(165),15),g), comb_gain);
-      if (g < 64)
-         comb_gain = MULT16_16_Q15(SHL16(g, 9), comb_gain);
-   }
-#else
-   {
-      float g=0;
-      g = GAIN_SCALING_1*.5*(gain_3tap_to_1tap(pitch_gain)+gain_3tap_to_1tap(mem->last_pitch_gain));
-      if (g>1.3)
-         comb_gain*=1.3/g;
-      if (g<.5)
-         comb_gain*=2.*g;
-   }
-#endif
-   step = DIV32(COMB_STEP, nsf);
-   fact=0;
-
-   /*Apply pitch comb-filter (filter out noise between pitch harmonics)*/
-   for (i=0;i<nsf;i++)
-   {
-      spx_word32_t exc1, exc2;
-
-      fact = ADD16(fact,step);
-      
-      exc1 = SHL32(MULT16_32_Q15(SHL16(pitch_gain[0],7),exc[i-pitch+1]) +
-                 MULT16_32_Q15(SHL16(pitch_gain[1],7),exc[i-pitch]) +
-                 MULT16_32_Q15(SHL16(pitch_gain[2],7),exc[i-pitch-1]) , 2);
-      exc2 = SHL32(MULT16_32_Q15(SHL16(mem->last_pitch_gain[0],7),exc[i-mem->last_pitch+1]) +
-                 MULT16_32_Q15(SHL16(mem->last_pitch_gain[1],7),exc[i-mem->last_pitch]) +
-                 MULT16_32_Q15(SHL16(mem->last_pitch_gain[2],7),exc[i-mem->last_pitch-1]),2);
-
-      new_exc[i] = exc[i] + MULT16_32_Q15(comb_gain, ADD32(MULT16_32_Q15(fact,exc1), MULT16_32_Q15(SUB16(COMB_STEP,fact), exc2)));
-   }
-
-   mem->last_pitch_gain[0] = pitch_gain[0];
-   mem->last_pitch_gain[1] = pitch_gain[1];
-   mem->last_pitch_gain[2] = pitch_gain[2];
-   mem->last_pitch = pitch;
-
-   /*Amplitude after enhancement*/
-   new_exc_energy = compute_rms(new_exc, nsf);
-
-   if (exc_energy > new_exc_energy)
-      exc_energy = new_exc_energy;
+   int i; 
+   VARDECL(spx_word16_t *iexc);
+   spx_word16_t old_ener, new_ener;
+   int corr_pitch;
    
-   gain = DIV32_16(SHL32(EXTEND32(exc_energy),15),ADD16(1,new_exc_energy));
+   spx_word16_t iexc0_mag, iexc1_mag, exc_mag;
+   spx_word32_t corr0, corr1;
+   spx_word16_t gain0, gain1;
+   spx_word16_t pgain1, pgain2;
+   spx_word16_t c1, c2;
+   spx_word16_t g1, g2;
+   spx_word16_t ngain;
+   spx_word16_t gg1, gg2;
 
-#ifdef FIXED_POINT
-   if (gain < 16384)
-      gain = 16384;
-#else
-   if (gain < .5)
-      gain=.5;
-#endif
-
-#ifdef FIXED_POINT
-   for (i=0;i<nsf;i++)
+#if 0 /* Set to 1 to enable full pitch search */
+   int nol_pitch[6];
+   spx_word16_t nol_pitch_coef[6];
+   spx_word16_t ol_pitch_coef;
+   open_loop_nbest_pitch(exc, 20, 120, nsf, 
+                         nol_pitch, nol_pitch_coef, 6, stack);
+   corr_pitch=nol_pitch[0];
+   ol_pitch_coef = nol_pitch_coef[0];
+   /*Try to remove pitch multiples*/
+   for (i=1;i<6;i++)
    {
-      mem->smooth_gain = ADD16(MULT16_16_Q15(31457,mem->smooth_gain), MULT16_16_Q15(1311,gain));
-      new_exc[i] = MULT16_32_Q15(mem->smooth_gain, new_exc[i]);
+#ifdef FIXED_POINT
+      if ((nol_pitch_coef[i]>MULT16_16_Q15(nol_pitch_coef[0],19661)) && 
+#else
+      if ((nol_pitch_coef[i]>.6*nol_pitch_coef[0]) && 
+#endif
+         (ABS(2*nol_pitch[i]-corr_pitch)<=2 || ABS(3*nol_pitch[i]-corr_pitch)<=3 || 
+         ABS(4*nol_pitch[i]-corr_pitch)<=4 || ABS(5*nol_pitch[i]-corr_pitch)<=5))
+      {
+         corr_pitch = nol_pitch[i];
+      }
    }
 #else
-   for (i=0;i<nsf;i++)
-   {
-      mem->smooth_gain = .96*mem->smooth_gain + .04*gain;
-      new_exc[i] *= mem->smooth_gain;
-   }
+   corr_pitch = pitch;
 #endif
+   
+   ALLOC(iexc, 2*nsf, spx_word16_t);
+   
+   interp_pitch(exc, iexc, corr_pitch, 80);
+   if (corr_pitch>max_pitch)
+      interp_pitch(exc, iexc+nsf, 2*corr_pitch, 80);
+   else
+      interp_pitch(exc, iexc+nsf, -corr_pitch, 80);
+
+   /*interp_pitch(exc, iexc+2*nsf, 2*corr_pitch, 80);*/
+   
+   /*printf ("%d %d %f\n", pitch, corr_pitch, max_corr*ener_1);*/
+   iexc0_mag = spx_sqrt(1000+inner_prod(iexc,iexc,nsf));
+   iexc1_mag = spx_sqrt(1000+inner_prod(iexc+nsf,iexc+nsf,nsf));
+   exc_mag = spx_sqrt(1+inner_prod(exc,exc,nsf));
+   corr0  = inner_prod(iexc,exc,nsf);
+   if (corr0<0)
+      corr0=0;
+   corr1 = inner_prod(iexc+nsf,exc,nsf);
+   if (corr1<0)
+      corr1=0;
+#ifdef FIXED_POINT
+   /* Doesn't cost much to limit the ratio and it makes the rest easier */
+   if (SHL32(EXTEND32(iexc0_mag),6) < EXTEND32(exc_mag))
+      iexc0_mag = ADD16(1,PSHR16(exc_mag,6));
+   if (SHL32(EXTEND32(iexc1_mag),6) < EXTEND32(exc_mag))
+      iexc1_mag = ADD16(1,PSHR16(exc_mag,6));
+#endif
+   if (corr0 > MULT16_16(iexc0_mag,exc_mag))
+      pgain1 = QCONST16(1., 14);
+   else
+      pgain1 = PDIV32_16(SHL32(PDIV32(corr0, exc_mag),14),iexc0_mag);
+   if (corr1 > MULT16_16(iexc1_mag,exc_mag))
+      pgain2 = QCONST16(1., 14);
+   else
+      pgain2 = PDIV32_16(SHL32(PDIV32(corr1, exc_mag),14),iexc1_mag);
+   gg1 = PDIV32_16(SHL32(EXTEND32(exc_mag),8), iexc0_mag);
+   gg2 = PDIV32_16(SHL32(EXTEND32(exc_mag),8), iexc1_mag);
+   if (comb_gain>0)
+   {
+#ifdef FIXED_POINT
+      c1 = (MULT16_16_Q15(QCONST16(.4,15),comb_gain)+QCONST16(.07,15));
+      c2 = QCONST16(.5,15)+MULT16_16_Q14(QCONST16(1.72,14),(c1-QCONST16(.07,15)));
+#else
+      c1 = .4*comb_gain+.07;
+      c2 = .5+1.72*(c1-.07);
+#endif
+   } else 
+   {
+      c1=c2=0;
+   }
+#ifdef FIXED_POINT
+   g1 = 32767 - MULT16_16_Q13(MULT16_16_Q15(c2, pgain1),pgain1);
+   g2 = 32767 - MULT16_16_Q13(MULT16_16_Q15(c2, pgain2),pgain2);
+#else
+   g1 = 1-c2*pgain1*pgain1;
+   g2 = 1-c2*pgain2*pgain2;
+#endif
+   if (g1<c1)
+      g1 = c1;
+   if (g2<c1)
+      g2 = c1;
+   g1 = (spx_word16_t)PDIV32_16(SHL32(EXTEND32(c1),14),(spx_word16_t)g1);
+   g2 = (spx_word16_t)PDIV32_16(SHL32(EXTEND32(c1),14),(spx_word16_t)g2);
+   if (corr_pitch>max_pitch)
+   {
+      gain0 = MULT16_16_Q15(QCONST16(.7,15),MULT16_16_Q14(g1,gg1));
+      gain1 = MULT16_16_Q15(QCONST16(.3,15),MULT16_16_Q14(g2,gg2));
+   } else {
+      gain0 = MULT16_16_Q15(QCONST16(.6,15),MULT16_16_Q14(g1,gg1));
+      gain1 = MULT16_16_Q15(QCONST16(.6,15),MULT16_16_Q14(g2,gg2));
+   }
+   for (i=0;i<nsf;i++)
+      new_exc[i] = ADD16(exc[i], EXTRACT16(PSHR32(ADD32(MULT16_16(gain0,iexc[i]), MULT16_16(gain1,iexc[i+nsf])),8)));
+   /* FIXME: compute_rms16 is currently not quite accurate enough (but close) */
+   new_ener = compute_rms16(new_exc, nsf);
+   old_ener = compute_rms16(exc, nsf);
+   
+   if (old_ener < 1)
+      old_ener = 1;
+   if (new_ener < 1)
+      new_ener = 1;
+   if (old_ener > new_ener)
+      old_ener = new_ener;
+   ngain = PDIV32_16(SHL32(EXTEND32(old_ener),14),new_ener);
+   
+   for (i=0;i<nsf;i++)
+      new_exc[i] = MULT16_16_Q14(ngain, new_exc[i]);
 }
+
diff --git a/pjmedia/src/pjmedia-codec/speex/filters.h b/pjmedia/src/pjmedia-codec/speex/filters.h
index c86d189..35de5e1 100644
--- a/pjmedia/src/pjmedia-codec/speex/filters.h
+++ b/pjmedia/src/pjmedia-codec/speex/filters.h
@@ -38,8 +38,9 @@
 #include "misc.h"
 
 spx_word16_t compute_rms(const spx_sig_t *x, int len);
+spx_word16_t compute_rms16(const spx_word16_t *x, int len);
 void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len);
-void signal_div(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len);
+void signal_div(const spx_word16_t *x, spx_word16_t *y, spx_word32_t scale, int len);
 
 #ifdef FIXED_POINT
 
@@ -47,13 +48,6 @@
 
 #endif
 
-/** Combined filter memory. */
-typedef struct {
-   int   last_pitch;
-   spx_word16_t last_pitch_gain[3];
-   spx_word16_t smooth_gain;
-} CombFilterMem;
-
 
 void qmf_decomp(const spx_word16_t *xx, const spx_word16_t *aa, spx_sig_t *, spx_sig_t *y2, int N, int M, spx_word16_t *mem, char *stack);
 void fir_mem_up(const spx_sig_t *x, const spx_word16_t *a, spx_sig_t *y, int N, int M, spx_word32_t *mem, char *stack);
@@ -63,6 +57,10 @@
 void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem);
 void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem);
 
+void filter_mem16(const spx_word16_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack);
+void iir_mem16(const spx_word16_t *x, const spx_coef_t *den, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack);
+void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack);
+
 /* Apply bandwidth expansion on LPC coef */
 void bw_lpc(spx_word16_t , const spx_coef_t *lpc_in, spx_coef_t *lpc_out, int order);
 
@@ -74,19 +72,16 @@
 
 void compute_impulse_response(const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack);
 
-void comb_filter_mem_init (CombFilterMem *mem);
-
-void comb_filter(
-spx_sig_t *exc,          /*decoded excitation*/
-spx_sig_t *new_exc,      /*enhanced excitation*/
+void multicomb(
+spx_word16_t *exc,          /*decoded excitation*/
+spx_word16_t *new_exc,      /*enhanced excitation*/
 spx_coef_t *ak,           /*LPC filter coefs*/
 int p,               /*LPC order*/
 int nsf,             /*sub-frame size*/
 int pitch,           /*pitch period*/
-spx_word16_t *pitch_gain,   /*pitch gain (3-tap)*/
+int max_pitch,   /*pitch threshold: multicomb compares corr_pitch against it to choose the 2nd comb tap*/
 spx_word16_t  comb_gain,    /*gain of comb filter*/
-CombFilterMem *mem
+char *stack
 );
 
-
 #endif
diff --git a/pjmedia/src/pjmedia-codec/speex/filters_arm4.h b/pjmedia/src/pjmedia-codec/speex/filters_arm4.h
index b250364..ac4d7a9 100644
--- a/pjmedia/src/pjmedia-codec/speex/filters_arm4.h
+++ b/pjmedia/src/pjmedia-codec/speex/filters_arm4.h
@@ -101,6 +101,8 @@
    int i,j;
    spx_sig_t xi,yi,nyi;
 
+   for (i=0;i<ord;i++)
+      mem[i] = SHR32(mem[i],1);   
    for (i=0;i<N;i++)
    {
       int deadm, deadn, deadd, deadidx, x1, y1, dead1, dead2, dead3, dead4, dead5, dead6;
@@ -252,6 +254,8 @@
          : "cc", "memory");
    
    }
+   for (i=0;i<ord;i++)
+      mem[i] = SHL32(mem[i],1);   
 }
 
 #define OVERRIDE_IIR_MEM2
@@ -260,6 +264,9 @@
    int i,j;
    spx_sig_t xi,yi,nyi;
 
+   for (i=0;i<ord;i++)
+      mem[i] = SHR32(mem[i],1);   
+
    for (i=0;i<N;i++)
    {
       int deadm, deadd, deadidx, dead1, dead2, dead3, dead4, dead5, dead6;
@@ -376,4 +383,7 @@
          : "cc", "memory");
    
    }
+   for (i=0;i<ord;i++)
+      mem[i] = SHL32(mem[i],1);   
+
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/filters_bfin.h b/pjmedia/src/pjmedia-codec/speex/filters_bfin.h
index 9f7ea6a..2180ed4 100644
--- a/pjmedia/src/pjmedia-codec/speex/filters_bfin.h
+++ b/pjmedia/src/pjmedia-codec/speex/filters_bfin.h
@@ -32,14 +32,11 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include <stdio.h>
-
 #define OVERRIDE_NORMALIZE16
 int normalize16(const spx_sig_t *x, spx_word16_t *y, spx_sig_t max_scale, int len)
 {
    spx_sig_t max_val=1;
    int sig_shift;
-
    __asm__ 
    (
    "%0 = 0;\n\t"
@@ -67,18 +64,17 @@
    (
    "I0 = %0;\n\t"
    "L0 = 0;\n\t"
-   "I1 = %1;\n\t"
-   "L1 = 0;\n\t"
+   "P1 = %1;\n\t"
    "R0 = [I0++];\n\t"
-   "LOOP norm_shift%= LC0 = %3 >> 1;\n\t"
+   "LOOP norm_shift%= LC0 = %3;\n\t"
    "LOOP_BEGIN norm_shift%=;\n\t"
-      "R1 = ASHIFT R0 by %2.L || R2 = [I0++];\n\t"
-      "R3 = ASHIFT R2 by %2.L || R0 = [I0++];\n\t"
-      "R3 = PACK(R3.L, R1.L);\n\t"
-      "[I1++] = R3;\n\t"
+      "R1 = ASHIFT R0 by %2.L || R0 = [I0++];\n\t"
+      "W[P1++] = R1;\n\t"
    "LOOP_END norm_shift%=;\n\t"
-   : : "a" (x), "a" (y), "d" (-sig_shift), "a" (len)
-   : "I0", "L0", "I1", "L1", "R0", "R1", "R2", "R3", "memory"
+   "R1 = ASHIFT R0 by %2.L;\n\t"
+   "W[P1++] = R1;\n\t"
+   : : "a" (x), "a" (y), "d" (-sig_shift), "a" (len-1)
+   : "I0", "L0", "P1", "R0", "R1", "memory"
    );
    return sig_shift;
 }
@@ -103,26 +99,26 @@
    
    "P0 = %3;\n\t"
    "I0 = P0;\n\t"
-   "B0 = P0;\n\t"
+   "B0 = P0;\n\t" /* numden */
    "L0 = 0;\n\t"
       
-   "P2 = %0;\n\t"
+   "P2 = %0;\n\t" /* Fused xy */
    "I2 = P2;\n\t"
    "L2 = 0;\n\t"
    
-   "P4 = %6;\n\t"
-   "P0 = %1;\n\t"
-   "P1 = %2;\n\t"
+   "P4 = %6;\n\t" /* mem */
+   "P0 = %1;\n\t" /* _x */
+   "P1 = %2;\n\t" /* _y */
    
    /* First sample */
    "R1 = [P4++];\n\t"
-   "R1 <<= 1;\n\t"
-   "R2 = [P0++];\n\t"
+   "R1 <<= 1;\n\t" /* shift mem */
+   "R2 = [P0++];\n\t" /* load x[0] */
    "R1 = R1 + R2;\n\t"
-   "[P1++] = R1;\n\t"
+   "[P1++] = R1;\n\t" /* store y[0] */
    "R1 <<= 2;\n\t"
    "R2 <<= 2;\n\t"
-   "R2 = PACK(R1.H, R2.H);\n\t"
+   "R2 = PACK(R1.H, R2.H);\n\t" /* pack x16 and y16 */
    "[P2] = R2;\n\t"
                
    /* Samples 1 to ord-1 (using memory) */
@@ -147,13 +143,13 @@
       "LOOP_END filter_start_inner%=;\n\t"
       "A0 += A1;\n\t"
       "R4 = A0;\n\t"
-      "R4 <<= 1;\n\t"
-      "R2 = [P0++];\n\t"
+      "R4 <<= 1;\n\t" /* shift mem */
+      "R2 = [P0++];\n\t" /* load x */
       "R4 = R4 + R2;\n\t"
-      "[P1++] = R4;\n\t"
+      "[P1++] = R4;\n\t" /* store y */
       "R4 <<= 2;\n\t"
       "R2 <<= 2;\n\t"
-      "R2 = PACK(R4.H, R2.H);\n\t"
+      "R2 = PACK(R4.H, R2.H);\n\t" /* pack x16 and y16 */
       "[P2] = R2;\n\t"
 
    "LOOP_END filter_start%=;\n\t"
@@ -161,14 +157,14 @@
    /* Samples ord to N*/   
    "R0 = %5;\n\t"
    "R0 <<= 1;\n\t"
-   "I0 = B0;\n\t"
+   "I0 = B0;\n\t" /* numden */
    "R0 <<= 1;\n\t"   
    "L0 = R0;\n\t"
    
-   "R0 = %5;\n\t"
-   "R2 = %4;\n\t"
+   "R0 = %5;\n\t" /* org */
+   "R2 = %4;\n\t" /* N */
    "R2 = R2 - R0;\n\t"
-   "R4 = [I0++];\n\t"
+   "R4 = [I0++];\n\t" /* numden */
    "LC0 = R2;\n\t"
    "P3 = R0;\n\t"
    "R0 <<= 2;\n\t"
@@ -176,7 +172,7 @@
    "I2 = P2;\n\t"
    "M0 = R0;\n\t"
    "A1 = A0 = 0;\n\t"
-   "R5 = [I2--];\n\t"
+   "R5 = [I2--];\n\t" /* load xy */
    "LOOP filter_mid%= LC0;\n\t"
    "LOOP_BEGIN filter_mid%=;\n\t"
       "LOOP filter_mid_inner%= LC1=P3;\n\t"
@@ -184,9 +180,9 @@
          "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
       "LOOP_END filter_mid_inner%=;\n\t"
       "R0 = (A0 += A1) || I2 += M0;\n\t"
-      "R0 = R0 << 1 || R5 = [P0++];\n\t"
+      "R0 = R0 << 1 || R5 = [P0++];\n\t" /* load x */
       "R0 = R0 + R5;\n\t"
-      "R0 = R0 << 2 || [P1++] = R0;\n\t"
+      "R0 = R0 << 2 || [P1++] = R0;\n\t" /* shift y | store y */
       "R5 = R5 << 2;\n\t"
       "R5 = PACK(R0.H, R5.H);\n\t"
       "A1 = A0 = 0 || [I2--] = R5\n\t"
@@ -222,6 +218,150 @@
 }
 
 
+#define OVERRIDE_FILTER_MEM16 /* announces that this header provides filter_mem16; presumably makes the generic C version get skipped -- confirm where it is tested */
+void filter_mem16(const spx_word16_t *_x, const spx_coef_t *num, const spx_coef_t *den, spx_word16_t *_y, int N, int ord, spx_mem_t *mem, char *stack) /* Blackfin asm version: filters N 16-bit samples from _x into _y with ord num/den taps; mem[0..ord-1] is read as the starting state and rewritten before returning */
+{
+   VARDECL(spx_word32_t *xy2);
+   VARDECL(spx_word32_t *numden_a);
+   spx_word32_t *xy; /* one 32-bit word per sample; the asm stores PACK(y, x) here through P2/I2 */
+   spx_word16_t *numden; /* 16-bit view of numden_a holding the interleaved num/den taps */
+   int i;
+
+   ALLOC(xy2, (N+1), spx_word32_t); /* N words plus one leading spare word (xy starts at xy2+1) */
+   ALLOC(numden_a, (2*ord+2), spx_word32_t); /* NOTE(review): only 2*ord 16-bit entries are filled below; the larger sizing presumably gives slack for the asm look-ahead loads -- confirm */
+   xy = xy2+1;  /* skip the spare word */
+   numden = (spx_word16_t*) numden_a;
+
+   for (i=0;i<ord;i++) /* interleave the taps so each 32-bit load in the asm fetches one num/den pair */
+   {
+      numden[2*i] = num[i];
+      numden[2*i+1] = den[i];
+   }
+   __asm__ __volatile__
+   (
+   /* Register setup */
+   "R0 = %5;\n\t"      /*ord */
+   
+   "P0 = %3;\n\t"
+   "I0 = P0;\n\t"
+   "B0 = P0;\n\t" /* numden */
+   "L0 = 0;\n\t"
+      
+   "P2 = %0;\n\t" /* Fused xy */
+   "I2 = P2;\n\t"
+   "L2 = 0;\n\t"
+   
+   "P4 = %6;\n\t" /* mem */
+   "P0 = %1;\n\t" /* _x */
+   "P1 = %2;\n\t" /* _y */
+   
+   /* First sample */
+   "R1 = [P4++];\n\t" /* R1 = mem[0] */
+   "R1 <<= 3;\n\t" /* shift mem */
+   "R1.L = R1 (RND);\n\t" /* round R1 into its low 16-bit half */
+   "R2 = W[P0++];\n\t" /* load x[0] */
+   "R1.L = R1.L + R2.L;\n\t" /* y[0] = rounded mem term + x[0] (16-bit add) */
+   "W[P1++] = R1;\n\t" /* store y[0] */
+   "R2 = PACK(R1.L, R2.L);\n\t" /* pack x16 and y16 */
+   "[P2] = R2;\n\t" /* xy[0] = packed pair */
+               
+   /* Samples 1 to ord-1 (using memory) */
+   "R0 += -1;\n\t" /* ord-1 outer iterations */
+   "R3 = 0;\n\t" /* R3 = inner trip count, grows by one per sample */
+   "LC0 = R0;\n\t"
+   "LOOP filter_start%= LC0;\n\t"
+   "LOOP_BEGIN filter_start%=;\n\t"
+      "R3 += 1;\n\t"
+      "LC1 = R3;\n\t"
+      
+      "R1 = [P4++];\n\t" /* next mem[] entry */
+      "A1 = R1;\n\t" /* seed accumulator A1 with it */
+      "A0 = 0;\n\t"
+      "I0 = B0;\n\t" /* restart at numden[0] */
+      "I2 = P2;\n\t" /* most recently written xy word */
+      "P2 += 4;\n\t" /* P2 -> xy slot for the sample being computed */
+      "R4 = [I0++] || R5 = [I2--];\n\t" /* preload first tap pair and newest xy word */
+      "LOOP filter_start_inner%= LC1;\n\t"
+      "LOOP_BEGIN filter_start_inner%=;\n\t"
+         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t" /* two MACs; also fetches the next tap pair and the next older xy word */
+      "LOOP_END filter_start_inner%=;\n\t"
+      "A0 += A1;\n\t"
+      "R4 = A0;\n\t"
+      "R4 <<= 3;\n\t" /* shift mem */
+      "R4.L = R4 (RND);\n\t"
+      "R2 = W[P0++];\n\t" /* load x */
+      "R4.L = R4.L + R2.L;\n\t"
+      "W[P1++] = R4;\n\t" /* store y */
+      //"R4 <<= 2;\n\t"
+      //"R2 <<= 2;\n\t"
+      "R2 = PACK(R4.L, R2.L);\n\t" /* pack x16 and y16 */
+      "[P2] = R2;\n\t"
+
+   "LOOP_END filter_start%=;\n\t"
+
+   /* Samples ord to N*/   
+   "R0 = %5;\n\t"
+   "R0 <<= 1;\n\t"
+   "I0 = B0;\n\t" /* numden */
+   "R0 <<= 1;\n\t"   
+   "L0 = R0;\n\t" /* L0 = 4*ord bytes, the size of the filled numden table; presumably makes I0 wrap circularly over it -- confirm */
+   
+   "R0 = %5;\n\t" /* ord */
+   "R2 = %4;\n\t" /* N */
+   "R2 = R2 - R0;\n\t" /* N-ord samples left */
+   "R4 = [I0++];\n\t" /* numden */
+   "LC0 = R2;\n\t"
+   "P3 = R0;\n\t" /* P3 = ord, inner trip count */
+   "R0 <<= 2;\n\t"
+   "R0 += 8;\n\t"
+   "I2 = P2;\n\t"
+   "M0 = R0;\n\t" /* M0 = 4*ord+8 bytes, added to I2 after each inner loop */
+   "A1 = A0 = 0;\n\t"
+   "R5 = [I2--];\n\t" /* load xy */
+   "LOOP filter_mid%= LC0;\n\t"
+   "LOOP_BEGIN filter_mid%=;\n\t"
+      "LOOP filter_mid_inner%= LC1=P3;\n\t"
+      "LOOP_BEGIN filter_mid_inner%=;\n\t"
+         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
+      "LOOP_END filter_mid_inner%=;\n\t"
+      "R0 = (A0 += A1) || I2 += M0;\n\t"
+      "R0 = R0 << 3 || R5 = W[P0++];\n\t" /* load x */
+      "R0.L = R0 (RND);\n\t"
+      "R0.L = R0.L + R5.L;\n\t"
+      "R5 = PACK(R0.L, R5.L) || W[P1++] = R0;\n\t" /* pack x16 and y16 | store y */
+      "A1 = A0 = 0 || [I2--] = R5\n\t" /* clear accumulators | write packed pair into xy */
+      "LOOP_END filter_mid%=;\n\t"
+   "I2 += 4;\n\t"
+   "P2 = I2;\n\t"
+   /* Update memory */
+   "P4 = %6;\n\t" /* back to mem[0] */
+   "R0 = %5;\n\t"
+   "LC0 = R0;\n\t" /* ord outer iterations, one per mem entry */
+   "P0 = B0;\n\t" /* P0 now walks numden (no longer _x) */
+   "A1 = A0 = 0;\n\t"
+   "LOOP mem_update%= LC0;\n\t"
+   "LOOP_BEGIN mem_update%=;\n\t"
+      "I2 = P2;\n\t"
+      "I0 = P0;\n\t"
+      "P0 += 4;\n\t" /* next pass starts one tap pair later */
+      "R0 = LC0;\n\t"
+      "LC1 = R0;\n\t" /* inner trip count taken from the outer loop counter */
+      "R5 = [I2--] || R4 = [I0++];\n\t"
+      "LOOP mem_accum%= LC1;\n\t"
+      "LOOP_BEGIN mem_accum%=;\n\t"
+         "A1 -= R4.H*R5.H, A0 += R4.L*R5.L (IS) || R4 = [I0++] || R5 = [I2--];\n\t"
+      "LOOP_END mem_accum%=;\n\t"
+      "R0 = (A0 += A1);\n\t"
+      "A1 = A0 = 0 || [P4++] = R0;\n\t" /* clear accumulators | store updated mem entry */
+   "LOOP_END mem_update%=;\n\t"
+   "L0 = 0;\n\t" /* put L0 back to 0, as at entry */
+   : : "m" (xy), "m" (_x), "m" (_y), "m" (numden), "m" (N), "m" (ord), "m" (mem) /* %0=xy %1=_x %2=_y %3=numden %4=N %5=ord %6=mem */
+   : "A0", "A1", "R0", "R1", "R2", "R3", "R4", "R5", "P0", "P1", "P2", "P3", "P4", "B0", "I0", "I2", "L0", "L2", "M0", "memory"
+   );
+
+}
+
+
 
 
 #define OVERRIDE_IIR_MEM2
@@ -346,6 +486,132 @@
 
 }
 
+
+#define OVERRIDE_IIR_MEM16 /* announces that this header provides iir_mem16; presumably makes the generic C version get skipped -- confirm where it is tested */
+void iir_mem16(const spx_word16_t *_x, const spx_coef_t *den, spx_word16_t *_y, int N, int ord, spx_mem_t *mem, char *stack) /* Blackfin asm version: filters N 16-bit samples from _x into _y using ord den taps only; mem[0..ord-1] is read as the starting state and rewritten before returning */
+{
+   VARDECL(spx_word16_t *y);
+   spx_word16_t *yy; /* scratch copy of the output samples; the asm writes it through P3 and reads it back through I3 */
+
+   ALLOC(y, (N+2), spx_word16_t); /* N samples plus two leading spare entries (yy starts at y+2) */
+   yy = y+2;
+
+   __asm__ __volatile__
+   (
+   /* Register setup */
+   "R0 = %5;\n\t"      /*ord */
+   
+   "P1 = %3;\n\t" /* den */
+   "I1 = P1;\n\t"
+   "B1 = P1;\n\t" /* den base kept in B1 */
+   "L1 = 0;\n\t"
+   
+   "P3 = %0;\n\t" /* yy */
+   "I3 = P3;\n\t"
+   "L3 = 0;\n\t"
+   
+   "P4 = %6;\n\t" /* mem */
+   "P0 = %1;\n\t" /* _x */
+   "P1 = %2;\n\t" /* _y (P1 reused; den base stays in B1) */
+   
+   /* First sample */
+   "R1 = [P4++];\n\t" /* R1 = mem[0] */
+   "R1 = R1 << 3 (S);\n\t" /* NOTE(review): this shift carries (S); the matching shifts in the loops below do not -- confirm intended */
+   "R1.L = R1 (RND);\n\t" /* round R1 into its low 16-bit half */
+   "R2 = W[P0++];\n\t" /* load x[0] */
+   "R1 = R1 + R2;\n\t"
+   "W[P1++] = R1;\n\t" /* store y[0] to _y */
+   "W[P3] = R1;\n\t" /* and to yy[0] */
+
+   /* Samples 1 to ord-1 (using memory) */
+   "R0 += -1;\n\t" /* ord-1 outer iterations */
+   "R3 = 0;\n\t" /* R3 = inner trip count, grows by one per sample */
+   "LC0 = R0;\n\t"
+   "LOOP filter_start%= LC0;\n\t"
+   "LOOP_BEGIN filter_start%=;\n\t"
+      "R3 += 1;\n\t"
+      "LC1 = R3;\n\t"
+      
+      "R1 = [P4++];\n\t" /* next mem[] entry */
+      "A1 = R1;\n\t" /* seed accumulator A1 with it */
+      "I1 = B1;\n\t" /* restart at den[0] */
+      "I3 = P3;\n\t" /* most recently written yy entry */
+      "P3 += 2;\n\t" /* P3 -> yy slot for the sample being computed */
+      "LOOP filter_start_inner%= LC1;\n\t"
+      "LOOP_BEGIN filter_start_inner%=;\n\t"
+         "R4.L = W[I1++];\n\t" /* next den tap */
+         "R5.L = W[I3--];\n\t" /* next older output sample */
+         "A1 -= R4.L*R5.L (IS);\n\t"
+      "LOOP_END filter_start_inner%=;\n\t"
+   
+      "R1 = A1;\n\t"
+      "R1 <<= 3;\n\t"
+      "R1.L = R1 (RND);\n\t"
+      "R2 = W[P0++];\n\t" /* load x */
+      "R1 = R1 + R2;\n\t"
+      "W[P1++] = R1;\n\t" /* store y to _y */
+      "W[P3] = R1;\n\t" /* and to yy */
+   "LOOP_END filter_start%=;\n\t"
+
+   /* Samples ord to N*/   
+   "R0 = %5;\n\t"
+   "R0 <<= 1;\n\t"
+   "I1 = B1;\n\t"
+   "L1 = R0;\n\t" /* L1 = 2*ord bytes, the size of den[0..ord-1]; presumably makes I1 wrap circularly over it -- confirm */
+   
+   "R0 = %5;\n\t" /* ord */
+   "R2 = %4;\n\t" /* N */
+   "R2 = R2 - R0;\n\t" /* N-ord samples left */
+   "R4.L = W[I1++];\n\t" /* preload first den tap */
+   "LC0 = R2;\n\t"
+   "LOOP filter_mid%= LC0;\n\t"
+   "LOOP_BEGIN filter_mid%=;\n\t"
+      "LC1 = R0;\n\t" /* ord inner iterations */
+      "A1 = 0;\n\t"
+      "I3 = P3;\n\t"
+      "P3 += 2;\n\t"
+      "R5.L = W[I3--];\n\t" /* preload newest output sample */
+      "LOOP filter_mid_inner%= LC1;\n\t"
+      "LOOP_BEGIN filter_mid_inner%=;\n\t"
+         "A1 -= R4.L*R5.L (IS) || R4.L = W[I1++] || R5.L = W[I3--];\n\t" /* MAC; also fetches the next den tap and the next older output sample */
+      "LOOP_END filter_mid_inner%=;\n\t"
+      "R1 = A1;\n\t"
+      "R1 = R1 << 3 || R2 = W[P0++];\n\t" /* shift | load x */
+      "R1.L = R1 (RND);\n\t"
+      "R1 = R1 + R2;\n\t"
+      "W[P1++] = R1;\n\t" /* store y to _y */
+      "W[P3] = R1;\n\t" /* and to yy */
+   "LOOP_END filter_mid%=;\n\t"
+     
+   /* Update memory */
+   "P4 = %6;\n\t" /* back to mem[0] */
+   "R0 = %5;\n\t"
+   "LC0 = R0;\n\t" /* ord outer iterations, one per mem entry */
+   "P1 = B1;\n\t" /* P1 now walks den (no longer _y) */
+   "LOOP mem_update%= LC0;\n\t"
+   "LOOP_BEGIN mem_update%=;\n\t"
+      "A0 = 0;\n\t"
+      "I3 = P3;\n\t"
+      "I1 = P1;\n\t"
+      "P1 += 2;\n\t" /* next pass starts one tap later */
+      "R0 = LC0;\n\t"
+      "LC1=R0;\n\t" /* inner trip count taken from the outer loop counter */
+      "R5.L = W[I3--] || R4.L = W[I1++];\n\t"
+      "LOOP mem_accum%= LC1;\n\t"
+      "LOOP_BEGIN mem_accum%=;\n\t"
+         "A0 -= R4.L*R5.L (IS) || R4.L = W[I1++] || R5.L = W[I3--];\n\t"
+      "LOOP_END mem_accum%=;\n\t"
+      "R0 = A0;\n\t"
+      "[P4++] = R0;\n\t" /* store updated mem entry */
+   "LOOP_END mem_update%=;\n\t"
+   "L1 = 0;\n\t" /* put L1 back to 0, as at entry */
+   : : "m" (yy), "m" (_x), "m" (_y), "m" (den), "m" (N), "m" (ord), "m" (mem) /* %0=yy %1=_x %2=_y %3=den %4=N %5=ord %6=mem */
+   : "A0", "A1", "R0", "R1", "R2", "R3", "R4", "R5", "P0", "P1", "P2", "P3", "P4", "B1", "I1", "I3", "L1", "L3", "memory"
+   );
+
+}
+
+
 #define OVERRIDE_FIR_MEM2
 void fir_mem2(const spx_sig_t *x, const spx_coef_t *num, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
 {
@@ -358,6 +624,18 @@
    filter_mem2(x, num, den, y, N, ord, mem);
 }
 
+#define OVERRIDE_FIR_MEM16 /* announces that this header provides fir_mem16; presumably makes the generic C version get skipped -- confirm where it is tested */
+void fir_mem16(const spx_word16_t *x, const spx_coef_t *num, spx_word16_t *y, int N, int ord, spx_mem_t *mem, char *stack) /* FIR filter implemented by calling filter_mem16 with an all-zero denominator */
+{
+   int i;
+   spx_coef_t den2[12]; /* backing storage for the zero denominator; larger than the 10 taps zeroed below to leave room for the alignment skip */
+   spx_coef_t *den;
+   den = (spx_coef_t*)((((int)den2)+4)&0xfffffffc); /* next 4-byte-aligned address above den2; the pointer goes through int, so this assumes 32-bit pointers */
+   for (i=0;i<10;i++) /* NOTE(review): hard-coded 10 -- filter_mem16 reads den[0..ord-1], so this assumes ord <= 10; confirm against callers */
+      den[i] = 0;
+   filter_mem16(x, num, den, y, N, ord, mem, stack);
+}
+
 
 #define OVERRIDE_COMPUTE_IMPULSE_RESPONSE
 void compute_impulse_response(const spx_coef_t *ak, const spx_coef_t *awk1, const spx_coef_t *awk2, spx_word16_t *y, int N, int ord, char *stack)
diff --git a/pjmedia/src/pjmedia-codec/speex/fixed_bfin.h b/pjmedia/src/pjmedia-codec/speex/fixed_bfin.h
index adee43c..aa26f6a 100644
--- a/pjmedia/src/pjmedia-codec/speex/fixed_bfin.h
+++ b/pjmedia/src/pjmedia-codec/speex/fixed_bfin.h
@@ -36,11 +36,40 @@
 #ifndef FIXED_BFIN_H
 #define FIXED_BFIN_H
 
+#undef PDIV32_16 /* drop any earlier PDIV32_16 macro so the inline function below can take the name */
+static inline spx_word16_t PDIV32_16(spx_word32_t a, spx_word16_t b) /* Blackfin version of the rounding 32/16 divide: adds b>>1 to a, then divides by b with the DIVS/DIVQ sequence */
+{
+   spx_word32_t res, bb;
+   bb = b; /* 32-bit copy of the divisor, used as the asm memory operand */
+   a += b>>1; /* rounding offset: half the divisor */
+   __asm__  (
+         "P0 = 15;\n\t" /* number of DIVQ steps */
+         "R0 = %1;\n\t" /* a (already offset) */
+         "R1 = %2;\n\t" /* bb */
+         //"R0 = R0 + R1;\n\t"
+         "R0 <<= 1;\n\t"
+         "DIVS (R0, R1);\n\t"
+         "LOOP divide%= LC0 = P0;\n\t"
+         "LOOP_BEGIN divide%=;\n\t"
+            "DIVQ (R0, R1);\n\t"
+         "LOOP_END divide%=;\n\t"
+         "R0 = R0.L;\n\t" /* keep only the low 16-bit half as the quotient */
+         "%0 = R0;\n\t"
+   : "=m" (res)
+   : "m" (a), "m" (bb)
+   : "P0", "R0", "R1", "cc");
+   return res; /* 32-bit res is converted to the spx_word16_t return type */
+}
+
 #undef DIV32_16
 static inline spx_word16_t DIV32_16(spx_word32_t a, spx_word16_t b)
 {
    spx_word32_t res, bb;
    bb = b;
+   /* Make the rounding consistent with the C version 
+      (do we need to do that?)*/
+   if (a<0) 
+      a += (b-1);
    __asm__  (
          "P0 = 15;\n\t"
          "R0 = %1;\n\t"
@@ -79,14 +108,12 @@
    spx_word32_t res;
    __asm__
    (
-         "%1 <<= 1;\n\t"
-         "A1 = %2.L*%1.L (M,IS);\n\t"
-         "A1 = A1 >>> 16;\n\t"
-         "R1 = (A1 += %2.L*%1.H) (IS);\n\t"
-         "%0 = R1;\n\t"
-   : "=&d" (res), "=&d" (b)
+         "A1 = %2.L*%1.L (M);\n\t"
+         "A1 = A1 >>> 15;\n\t"
+         "%0 = (A1 += %2.L*%1.H) ;\n\t"
+   : "=&W" (res), "=&d" (b)
    : "d" (a), "1" (b)
-   : "A1", "R1"
+   : "A1"
    );
    return res;
 }
@@ -97,14 +124,13 @@
    spx_word32_t res;
    __asm__
          (
-         "%1 <<= 1;\n\t"
-         "A1 = %2.L*%1.L (M,IS);\n\t"
-         "A1 = A1 >>> 16;\n\t"
-         "R1 = (A1 += %2.L*%1.H) (IS);\n\t"
-         "%0 = R1 + %4;\n\t"
-   : "=&d" (res), "=&d" (b)
+         "A1 = %2.L*%1.L (M);\n\t"
+         "A1 = A1 >>> 15;\n\t"
+         "%0 = (A1 += %2.L*%1.H);\n\t"
+         "%0 = %0 + %4;\n\t"
+   : "=&W" (res), "=&d" (b)
    : "d" (a), "1" (b), "d" (c)
-   : "A1", "R1"
+   : "A1"
          );
    return res;
 }
@@ -115,14 +141,13 @@
    spx_word32_t res;
    __asm__
          (
-         "%2 <<= 2;\n\t"
-         "A1 = %1.L*%2.L (M,IS);\n\t"
-         "A1 = A1 >>> 16;\n\t"
-         "R1 = (A1 += %1.L*%2.H) (IS);\n\t"
-         "%0 = R1;\n\t"
-   : "=d" (res), "=d" (a), "=d" (b)
+         "%2 <<= 1;\n\t"
+         "A1 = %1.L*%2.L (M);\n\t"
+         "A1 = A1 >>> 15;\n\t"
+         "%0 = (A1 += %1.L*%2.H);\n\t"
+   : "=W" (res), "=d" (a), "=d" (b)
    : "1" (a), "2" (b)
-   : "A1", "R1"
+   : "A1"
          );
    return res;
 }
@@ -133,14 +158,14 @@
    spx_word32_t res;
    __asm__
          (
-         "%1 <<= 2;\n\t"
-         "A1 = %2.L*%1.L (M,IS);\n\t"
-         "A1 = A1 >>> 16;\n\t"
-         "R1 = (A1 += %2.L*%1.H) (IS);\n\t"
-         "%0 = R1 + %4;\n\t"
-   : "=&d" (res), "=&d" (b)
+         "%1 <<= 1;\n\t"
+         "A1 = %2.L*%1.L (M);\n\t"
+         "A1 = A1 >>> 15;\n\t"
+         "%0 = (A1 += %2.L*%1.H);\n\t"
+         "%0 = %0 + %4;\n\t"
+   : "=&W" (res), "=&d" (b)
    : "d" (a), "1" (b), "d" (c)
-   : "A1", "R1"
+   : "A1"
          );
    return res;
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/fixed_debug.h b/pjmedia/src/pjmedia-codec/speex/fixed_debug.h
index 7403a18..65c5712 100644
--- a/pjmedia/src/pjmedia-codec/speex/fixed_debug.h
+++ b/pjmedia/src/pjmedia-codec/speex/fixed_debug.h
@@ -40,8 +40,9 @@
 extern long long spx_mips;
 #define MIPS_INC spx_mips++,
 
-#define QCONST16(x,bits) ((spx_word16_t)((x)*(1<<(bits))+(1<<((bits)-1))))
-#define QCONST32(x,bits) ((spx_word32_t)((x)*(1<<(bits))+(1<<((bits)-1))))
+#define QCONST16(x,bits) ((spx_word16_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
+#define QCONST32(x,bits) ((spx_word32_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
+
 
 #define VERIFY_SHORT(x) ((x)<=32767&&(x)>=-32768)
 #define VERIFY_INT(x) ((x)<=2147483647LL&&(x)>=-2147483648LL)
@@ -169,7 +170,7 @@
    }
    res = a+b;
    if (!VERIFY_SHORT(res))
-      fprintf (stderr, "ADD16: output is not short: %d\n", res);
+      fprintf (stderr, "ADD16: output is not short: %d+%d=%d\n", a,b,res);
    spx_mips++;
    return res;
 }
@@ -196,7 +197,9 @@
    }
    res = a+b;
    if (!VERIFY_INT(res))
+   {
       fprintf (stderr, "ADD32: output is not int: %d\n", (int)res);
+   }
    spx_mips++;
    return res;
 }
@@ -251,6 +254,8 @@
 #define MAC16_16(c,a,b)     (spx_mips--,ADD32((c),MULT16_16((a),(b))))
 #define MAC16_16_Q11(c,a,b)     (ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),11))))
 #define MAC16_16_Q13(c,a,b)     (ADD16((c),EXTRACT16(SHR32(MULT16_16((a),(b)),13))))
+#define MAC16_16_P13(c,a,b)     (ADD32((c),SHR(ADD32(4096,MULT16_16((a),(b))),13)))
+
 
 static inline int MULT16_32_QX(int a, long long b, int Q)
 {
@@ -437,7 +442,7 @@
    spx_mips+=36;
    return res;
 }
-
-
+#define PDIV32(a,b) DIV32(ADD32((a),(b)>>1),b)
+#define PDIV32_16(a,b) DIV32_16(ADD32((a),(b)>>1),b)
 
 #endif
diff --git a/pjmedia/src/pjmedia-codec/speex/fixed_generic.h b/pjmedia/src/pjmedia-codec/speex/fixed_generic.h
index d4bdc15..375050c 100644
--- a/pjmedia/src/pjmedia-codec/speex/fixed_generic.h
+++ b/pjmedia/src/pjmedia-codec/speex/fixed_generic.h
@@ -35,13 +35,13 @@
 #ifndef FIXED_GENERIC_H
 #define FIXED_GENERIC_H
 
-#define QCONST16(x,bits) ((spx_word16_t)(.5+(x)*(1<<(bits))))
-#define QCONST32(x,bits) ((spx_word32_t)(.5+(x)*(1<<(bits))))
+#define QCONST16(x,bits) ((spx_word16_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
+#define QCONST32(x,bits) ((spx_word32_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
 
 #define NEG16(x) (-(x))
 #define NEG32(x) (-(x))
-#define EXTRACT16(x) ((spx_word16_t)x)
-#define EXTEND32(x) ((spx_word32_t)x)
+#define EXTRACT16(x) ((spx_word16_t)(x))
+#define EXTEND32(x) ((spx_word32_t)(x))
 #define SHR16(a,shift) ((a) >> (shift))
 #define SHL16(a,shift) ((a) << (shift))
 #define SHR32(a,shift) ((a) >> (shift))
@@ -61,7 +61,6 @@
 #define SUB16(a,b) ((spx_word16_t)(a)-(spx_word16_t)(b))
 #define ADD32(a,b) ((spx_word32_t)(a)+(spx_word32_t)(b))
 #define SUB32(a,b) ((spx_word32_t)(a)-(spx_word32_t)(b))
-#define ADD64(a,b) ((spx_word64_t)(a)+(spx_word64_t)(b))
 
 
 /* result fits in 16 bits */
@@ -84,6 +83,7 @@
 
 #define MAC16_16_Q11(c,a,b)     (ADD32((c),SHR(MULT16_16((a),(b)),11)))
 #define MAC16_16_Q13(c,a,b)     (ADD32((c),SHR(MULT16_16((a),(b)),13)))
+#define MAC16_16_P13(c,a,b)     (ADD32((c),SHR(ADD32(4096,MULT16_16((a),(b))),13)))
 
 #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
 #define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
@@ -97,6 +97,8 @@
 #define MUL_16_32_R15(a,bh,bl) ADD32(MULT16_16((a),(bh)), SHR(MULT16_16((a),(bl)),15))
 
 #define DIV32_16(a,b) ((spx_word16_t)(((spx_word32_t)(a))/((spx_word16_t)(b))))
+#define PDIV32_16(a,b) ((spx_word16_t)(((spx_word32_t)(a)+((spx_word16_t)(b)>>1))/((spx_word16_t)(b))))
 #define DIV32(a,b) (((spx_word32_t)(a))/((spx_word32_t)(b)))
+#define PDIV32(a,b) (((spx_word32_t)(a)+((spx_word16_t)(b)>>1))/((spx_word32_t)(b)))
 
 #endif
diff --git a/pjmedia/src/pjmedia-codec/speex/gain_table.c b/pjmedia/src/pjmedia-codec/speex/gain_table.c
index 54a5407..00b8244 100644
--- a/pjmedia/src/pjmedia-codec/speex/gain_table.c
+++ b/pjmedia/src/pjmedia-codec/speex/gain_table.c
@@ -29,132 +29,132 @@
    POSSIBILITY OF SUCH DAMAGE.
 */
 
-const signed char gain_cdbk_nb[384] = {
--32,-32,-32,
--28,-67,-5,
--42,-6,-32,
--57,-10,-54,
--16,27,-41,
-19,-19,-40,
--45,24,-21,
--8,-14,-18,
-1,14,-58,
--18,-88,-39,
--38,21,-18,
--19,20,-43,
-10,17,-48,
--52,-58,-13,
--44,-1,-11,
--12,-11,-34,
-14,0,-46,
--37,-35,-34,
--25,44,-30,
-6,-4,-63,
--31,43,-41,
--23,30,-43,
--43,26,-14,
--33,1,-13,
--13,18,-37,
--46,-73,-45,
--36,24,-25,
--36,-11,-20,
--25,12,-18,
--36,-69,-59,
--45,6,8,
--22,-14,-24,
--1,13,-44,
--39,-48,-26,
--32,31,-37,
--33,15,-46,
--24,30,-36,
--41,31,-23,
--50,22,-4,
--22,2,-21,
--17,30,-34,
--7,-60,-28,
--38,42,-28,
--44,-11,21,
--16,8,-44,
--39,-55,-43,
--11,-35,26,
--9,0,-34,
--8,121,-81,
-7,-16,-22,
--37,33,-31,
--27,-7,-36,
--34,70,-57,
--37,-11,-48,
--40,17,-1,
--33,6,-6,
--9,0,-20,
--21,69,-33,
--29,33,-31,
--55,12,-1,
--33,27,-22,
--50,-33,-47,
--50,54,51,
--1,-5,-44,
--4,22,-40,
--39,-66,-25,
--33,1,-26,
--24,-23,-25,
--11,21,-45,
--25,-45,-19,
--43,105,-16,
-5,-21,1,
--16,11,-33,
--13,-99,-4,
--37,33,-15,
--25,37,-63,
--36,24,-31,
--53,-56,-38,
--41,-4,4,
--33,13,-30,
-49,52,-94,
--5,-30,-15,
-1,38,-40,
--23,12,-36,
--17,40,-47,
--37,-41,-39,
--49,34,0,
--18,-7,-4,
--16,17,-27,
-30,5,-62,
-4,48,-68,
--43,11,-11,
--18,19,-15,
--23,-62,-39,
--42,10,-2,
--21,-13,-13,
--9,13,-47,
--23,-62,-24,
--44,60,-21,
--18,-3,-52,
--22,22,-36,
--75,57,16,
--19,3,10,
--29,23,-38,
--5,-62,-51,
--51,40,-18,
--42,13,-24,
--34,14,-20,
--56,-75,-26,
--26,32,15,
--26,17,-29,
--7,28,-52,
--12,-30,5,
--5,-48,-5,
-2,2,-43,
-21,16,16,
--25,-45,-32,
--43,18,-10,
-9,0,-1,
--1,7,-30,
-19,-48,-4,
--28,25,-29,
--22,0,-31,
--32,17,-10,
--64,-41,-62,
--52,15,16,
--30,-22,-32,
--7,9,-38};
+const signed char gain_cdbk_nb[512] = {
+-32, -32, -32, 0,
+-28, -67, -5, 33,
+-42, -6, -32, 18,
+-57, -10, -54, 35,
+-16, 27, -41, 42,
+19, -19, -40, 36,
+-45, 24, -21, 40,
+-8, -14, -18, 28,
+1, 14, -58, 53,
+-18, -88, -39, 39,
+-38, 21, -18, 37,
+-19, 20, -43, 38,
+10, 17, -48, 54,
+-52, -58, -13, 33,
+-44, -1, -11, 32,
+-12, -11, -34, 22,
+14, 0, -46, 46,
+-37, -35, -34, 5,
+-25, 44, -30, 43,
+6, -4, -63, 49,
+-31, 43, -41, 43,
+-23, 30, -43, 41,
+-43, 26, -14, 44,
+-33, 1, -13, 27,
+-13, 18, -37, 37,
+-46, -73, -45, 34,
+-36, 24, -25, 34,
+-36, -11, -20, 19,
+-25, 12, -18, 33,
+-36, -69, -59, 34,
+-45, 6, 8, 46,
+-22, -14, -24, 18,
+-1, 13, -44, 44,
+-39, -48, -26, 15,
+-32, 31, -37, 34,
+-33, 15, -46, 31,
+-24, 30, -36, 37,
+-41, 31, -23, 41,
+-50, 22, -4, 50,
+-22, 2, -21, 28,
+-17, 30, -34, 40,
+-7, -60, -28, 29,
+-38, 42, -28, 42,
+-44, -11, 21, 43,
+-16, 8, -44, 34,
+-39, -55, -43, 21,
+-11, -35, 26, 41,
+-9, 0, -34, 29,
+-8, 121, -81, 113,
+7, -16, -22, 33,
+-37, 33, -31, 36,
+-27, -7, -36, 17,
+-34, 70, -57, 65,
+-37, -11, -48, 21,
+-40, 17, -1, 44,
+-33, 6, -6, 33,
+-9, 0, -20, 34,
+-21, 69, -33, 57,
+-29, 33, -31, 35,
+-55, 12, -1, 49,
+-33, 27, -22, 35,
+-50, -33, -47, 17,
+-50, 54, 51, 94,
+-1, -5, -44, 35,
+-4, 22, -40, 45,
+-39, -66, -25, 24,
+-33, 1, -26, 20,
+-24, -23, -25, 12,
+-11, 21, -45, 44,
+-25, -45, -19, 17,
+-43, 105, -16, 82,
+5, -21, 1, 41,
+-16, 11, -33, 30,
+-13, -99, -4, 57,
+-37, 33, -15, 44,
+-25, 37, -63, 54,
+-36, 24, -31, 31,
+-53, -56, -38, 26,
+-41, -4, 4, 37,
+-33, 13, -30, 24,
+49, 52, -94, 114,
+-5, -30, -15, 23,
+1, 38, -40, 56,
+-23, 12, -36, 29,
+-17, 40, -47, 51,
+-37, -41, -39, 11,
+-49, 34, 0, 58,
+-18, -7, -4, 34,
+-16, 17, -27, 35,
+30, 5, -62, 65,
+4, 48, -68, 76,
+-43, 11, -11, 38,
+-18, 19, -15, 41,
+-23, -62, -39, 23,
+-42, 10, -2, 41,
+-21, -13, -13, 25,
+-9, 13, -47, 42,
+-23, -62, -24, 24,
+-44, 60, -21, 58,
+-18, -3, -52, 32,
+-22, 22, -36, 34,
+-75, 57, 16, 90,
+-19, 3, 10, 45,
+-29, 23, -38, 32,
+-5, -62, -51, 38,
+-51, 40, -18, 53,
+-42, 13, -24, 32,
+-34, 14, -20, 30,
+-56, -75, -26, 37,
+-26, 32, 15, 59,
+-26, 17, -29, 29,
+-7, 28, -52, 53,
+-12, -30, 5, 30,
+-5, -48, -5, 35,
+2, 2, -43, 40,
+21, 16, 16, 75,
+-25, -45, -32, 10,
+-43, 18, -10, 42,
+9, 0, -1, 52,
+-1, 7, -30, 36,
+19, -48, -4, 48,
+-28, 25, -29, 32,
+-22, 0, -31, 22,
+-32, 17, -10, 36,
+-64, -41, -62, 36,
+-52, 15, 16, 58,
+-30, -22, -32, 6,
+-7, 9, -38, 36};
diff --git a/pjmedia/src/pjmedia-codec/speex/gain_table_lbr.c b/pjmedia/src/pjmedia-codec/speex/gain_table_lbr.c
index 24357f0..3c1c3db 100644
--- a/pjmedia/src/pjmedia-codec/speex/gain_table_lbr.c
+++ b/pjmedia/src/pjmedia-codec/speex/gain_table_lbr.c
@@ -29,36 +29,36 @@
    POSSIBILITY OF SUCH DAMAGE.
 */
 
-const signed char gain_cdbk_lbr[96] = {
--32,-32,-32,
--31,-58,-16,
--41,-24,-43,
--56,-22,-55,
--13,33,-41,
--4,-39,-9,
--41,15,-12,
--8,-15,-12,
-1,2,-44,
--22,-66,-42,
--38,28,-23,
--21,14,-37,
-0,21,-50,
--53,-71,-27,
--37,-1,-19,
--19,-5,-28,
-6,65,-44,
--33,-48,-33,
--40,57,-14,
--17,4,-45,
--31,38,-33,
--23,28,-40,
--43,29,-12,
--34,13,-23,
--16,15,-27,
--14,-82,-15,
--31,25,-32,
--21,5,-5,
--47,-63,-51,
--46,12,3,
--28,-17,-29,
--10,14,-40};
+const signed char gain_cdbk_lbr[128] = {
+-32, -32, -32, 0,
+-31, -58, -16, 22,
+-41, -24, -43, 14,
+-56, -22, -55, 29,
+-13, 33, -41, 47,
+-4, -39, -9, 29,
+-41, 15, -12, 38,
+-8, -15, -12, 31,
+1, 2, -44, 40,
+-22, -66, -42, 27,
+-38, 28, -23, 38,
+-21, 14, -37, 31,
+0, 21, -50, 52,
+-53, -71, -27, 33,
+-37, -1, -19, 25,
+-19, -5, -28, 22,
+6, 65, -44, 74,
+-33, -48, -33, 9,
+-40, 57, -14, 58,
+-17, 4, -45, 32,
+-31, 38, -33, 36,
+-23, 28, -40, 39,
+-43, 29, -12, 46,
+-34, 13, -23, 28,
+-16, 15, -27, 34,
+-14, -82, -15, 43,
+-31, 25, -32, 29,
+-21, 5, -5, 38,
+-47, -63, -51, 33,
+-46, 12, 3, 47,
+-28, -17, -29, 11,
+-10, 14, -40, 38};
diff --git a/pjmedia/src/pjmedia-codec/speex/jitter.c b/pjmedia/src/pjmedia-codec/speex/jitter.c
index 2860c10..a4c0751 100644
--- a/pjmedia/src/pjmedia-codec/speex/jitter.c
+++ b/pjmedia/src/pjmedia-codec/speex/jitter.c
@@ -36,9 +36,6 @@
 #include "config.h"
 #endif
 
-#ifndef NULL
-#define NULL 0
-#endif
 
 #include "misc.h"
 #include <speex/speex.h>
@@ -46,124 +43,162 @@
 #include <speex/speex_jitter.h>
 #include <stdio.h>
 
-#define LATE_BINS 4
+#define LATE_BINS 10
+#define MAX_MARGIN 30                     /**< Number of bins in margin histogram */
 
-void speex_jitter_init(SpeexJitter *jitter, void *decoder, int sampling_rate)
+#define SPEEX_JITTER_MAX_BUFFER_SIZE 200   /**< Maximum number of packets in jitter buffer */
+
+
+
+#define GT32(a,b) (((spx_int32_t)((a)-(b)))>0)
+#define GE32(a,b) (((spx_int32_t)((a)-(b)))>=0)
+#define LT32(a,b) (((spx_int32_t)((a)-(b)))<0)
+#define LE32(a,b) (((spx_int32_t)((a)-(b)))<=0)
+
+/** Jitter buffer structure */
+struct JitterBuffer_ {
+   spx_uint32_t pointer_timestamp;                                        /**< Timestamp of what we will *get* next */
+   spx_uint32_t current_timestamp;                                        /**< Timestamp of the local clock (what we will *play* next) */
+
+   char *buf[SPEEX_JITTER_MAX_BUFFER_SIZE];                               /**< Buffer of packets (NULL if slot is free) */
+   spx_uint32_t timestamp[SPEEX_JITTER_MAX_BUFFER_SIZE];                  /**< Timestamp of packet                 */
+   int span[SPEEX_JITTER_MAX_BUFFER_SIZE];                                /**< Span of packet, in timestamp units  */
+   int len[SPEEX_JITTER_MAX_BUFFER_SIZE];                                 /**< Number of bytes in packet           */
+
+   int tick_size;                                                         /**< Output granularity                  */
+   int reset_state;                                                       /**< True if state was just reset        */
+   int buffer_margin;                                                     /**< How many frames we want to keep in the buffer (lower bound) */
+   
+   int lost_count;                                                        /**< Number of consecutive lost packets  */
+   float shortterm_margin[MAX_MARGIN];                                    /**< Short term margin histogram         */
+   float longterm_margin[MAX_MARGIN];                                     /**< Long term margin histogram          */
+   float loss_rate;                                                       /**< Average loss rate                   */
+};
+
+/** Initialise jitter buffer */
+JitterBuffer *jitter_buffer_init(int tick)
+{
+   JitterBuffer *jitter = speex_alloc(sizeof(JitterBuffer));
+   if (jitter)
+   {
+      int i;
+      for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
+         jitter->buf[i]=NULL;
+      jitter->tick_size = tick;
+      jitter->buffer_margin = 1;
+      jitter_buffer_reset(jitter);
+   }
+   return jitter;
+}
+
+/** Reset jitter buffer */
+void jitter_buffer_reset(JitterBuffer *jitter)
 {
    int i;
    for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
    {
-      jitter->len[i]=-1;
-      jitter->timestamp[i]=-1;
+      if (jitter->buf[i])
+      {
+         speex_free(jitter->buf[i]);
+         jitter->buf[i] = NULL;
+      }
    }
-
-   jitter->dec = decoder;
-   speex_decoder_ctl(decoder, SPEEX_GET_FRAME_SIZE, &jitter->frame_size);
-   jitter->frame_time = jitter->frame_size;
-
-   speex_bits_init(&jitter->current_packet);
-   jitter->valid_bits = 0;
-
-   jitter->buffer_size = 4;
-
-   jitter->pointer_timestamp = -jitter->frame_time * jitter->buffer_size;
+   /* Timestamp is actually undefined at this point */
+   jitter->pointer_timestamp = 0;
+   jitter->current_timestamp = 0;
    jitter->reset_state = 1;
    jitter->lost_count = 0;
    jitter->loss_rate = 0;
+   for (i=0;i<MAX_MARGIN;i++)
+   {
+      jitter->shortterm_margin[i] = 0;
+      jitter->longterm_margin[i] = 0;
+   }
+   /*fprintf (stderr, "reset\n");*/
 }
 
-void speex_jitter_destroy(SpeexJitter *jitter)
+/** Destroy jitter buffer */
+void jitter_buffer_destroy(JitterBuffer *jitter)
 {
-   speex_bits_destroy(&jitter->current_packet);
+   jitter_buffer_reset(jitter);
+   speex_free(jitter);
 }
 
-
-void speex_jitter_put(SpeexJitter *jitter, char *packet, int len, int timestamp)
+/** Put one packet into the jitter buffer */
+void jitter_buffer_put(JitterBuffer *jitter, const JitterBufferPacket *packet)
 {
    int i,j;
-   int arrival_margin;
-
+   spx_int32_t arrival_margin;
+   /*fprintf (stderr, "put packet %d %d\n", timestamp, span);*/
    if (jitter->reset_state)
    {
       jitter->reset_state=0;
-      jitter->pointer_timestamp = timestamp-jitter->frame_time * jitter->buffer_size;
-      for (i=0;i<MAX_MARGIN;i++)
-      {
-         jitter->shortterm_margin[i] = 0;
-         jitter->longterm_margin[i] = 0;
-      }
-      for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
-      {
-         jitter->len[i]=-1;
-         jitter->timestamp[i]=-1;
-      }
-      fprintf(stderr, "reset to %d\n", timestamp);
+      jitter->pointer_timestamp = packet->timestamp;
+      jitter->current_timestamp = packet->timestamp;
+      /*fprintf(stderr, "reset to %d\n", timestamp);*/
    }
    
    /* Cleanup buffer (remove old packets that weren't played) */
    for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
    {
-      if (jitter->timestamp[i]<jitter->pointer_timestamp)
+      if (jitter->buf[i] && LE32(jitter->timestamp[i] + jitter->span[i], jitter->pointer_timestamp))
       {
-         jitter->len[i]=-1;
-         /*if (jitter->timestamp[i] != -1)
-            fprintf (stderr, "discarding %d %d\n", jitter->timestamp[i], jitter->pointer_timestamp);*/
+         /*fprintf (stderr, "cleaned (not played)\n");*/
+         speex_free(jitter->buf[i]);
+         jitter->buf[i] = NULL;
       }
    }
 
    /*Find an empty slot in the buffer*/
    for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
    {
-      if (jitter->len[i]==-1)
+      if (jitter->buf[i]==NULL)
          break;
    }
 
    /*fprintf(stderr, "%d %d %f\n", timestamp, jitter->pointer_timestamp, jitter->drift_average);*/
+   /*No place left in the buffer*/
    if (i==SPEEX_JITTER_MAX_BUFFER_SIZE)
    {
       int earliest=jitter->timestamp[0];
       i=0;
       for (j=1;j<SPEEX_JITTER_MAX_BUFFER_SIZE;j++)
       {
-         if (jitter->timestamp[j]<earliest)
+         if (!jitter->buf[i] || LT32(jitter->timestamp[j],earliest))
          {
             earliest = jitter->timestamp[j];
             i=j;
          }
       }
-      /*fprintf (stderr, "Buffer is full, discarding earliest frame %d (currently at %d)\n", timestamp, jitter->pointer_timestamp);*/
-      /*No place left in the buffer*/
-      
-      /*skip some frame(s) */
-      /*return;*/
+      speex_free(jitter->buf[i]);
+      jitter->buf[i]=NULL;
+      if (jitter->lost_count>20)
+      {
+         jitter_buffer_reset(jitter);
+      }
+      /*fprintf (stderr, "Buffer is full, discarding earliest frame %d (currently at %d)\n", timestamp, jitter->pointer_timestamp);*/      
    }
    
    /* Copy packet in buffer */
-   if (len>SPEEX_JITTER_MAX_PACKET_SIZE)
-      len=SPEEX_JITTER_MAX_PACKET_SIZE;
-   for (j=0;j<len/BYTES_PER_CHAR;j++)
-      jitter->buf[i][j]=packet[j];
-   jitter->timestamp[i]=timestamp;
-   jitter->len[i]=len;
+   jitter->buf[i]=speex_alloc(packet->len);
+   for (j=0;j<packet->len;j++)
+      jitter->buf[i][j]=packet->data[j];
+   jitter->timestamp[i]=packet->timestamp;
+   jitter->span[i]=packet->span;
+   jitter->len[i]=packet->len;
    
-   /* Don't count late packets when adjusting the synchro (we're taking care of them elsewhere) */
-   /*if (timestamp <= jitter->pointer_timestamp)
-   {
-      fprintf (stderr, "frame for timestamp %d arrived too late (at time %d)\n", timestamp, jitter->pointer_timestamp);
-   }*/
-
    /* Adjust the buffer size depending on network conditions */
-   arrival_margin = (timestamp - jitter->pointer_timestamp - jitter->frame_time);
+   arrival_margin = (packet->timestamp - jitter->current_timestamp) - jitter->buffer_margin*jitter->tick_size;
    
-   if (arrival_margin >= -LATE_BINS*jitter->frame_time)
+   if (arrival_margin >= -LATE_BINS*jitter->tick_size)
    {
-      int int_margin;
+      spx_int32_t int_margin;
       for (i=0;i<MAX_MARGIN;i++)
       {
          jitter->shortterm_margin[i] *= .98;
          jitter->longterm_margin[i] *= .995;
       }
-      int_margin = (arrival_margin + LATE_BINS*jitter->frame_time)/jitter->frame_time;
+      int_margin = LATE_BINS + arrival_margin/jitter->tick_size;
       if (int_margin>MAX_MARGIN-1)
          int_margin = MAX_MARGIN-1;
       if (int_margin>=0)
@@ -171,21 +206,52 @@
          jitter->shortterm_margin[int_margin] += .02;
          jitter->longterm_margin[int_margin] += .005;
       }
+   } else {
+      
+      /*fprintf (stderr, "way too late = %d\n", arrival_margin);*/
+      if (jitter->lost_count>20)
+      {
+         jitter_buffer_reset(jitter);
+      }
    }
-   
-   /*fprintf (stderr, "margin : %d %d %f %f %f %f\n", arrival_margin, jitter->buffer_size, 100*jitter->loss_rate, 100*jitter->late_ratio, 100*jitter->ontime_ratio, 100*jitter->early_ratio);*/
+#if 0 /* Enable to check how much is being buffered */
+   if (rand()%1000==0)
+   {
+      int count = 0;
+      for (j=0;j<SPEEX_JITTER_MAX_BUFFER_SIZE;j++)
+      {
+         if (jitter->buf[j])
+            count++;
+      }
+      fprintf (stderr, "buffer_size = %d\n", count);
+   }
+#endif
 }
 
-void speex_jitter_get(SpeexJitter *jitter, short *out, int *current_timestamp)
+/** Get one packet from the jitter buffer */
+int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint32_t *start_offset)
 {
-   int i;
-   int ret;
+   int i, j;
    float late_ratio_short;
    float late_ratio_long;
    float ontime_ratio_short;
    float ontime_ratio_long;
    float early_ratio_short;
    float early_ratio_long;
+   int chunk_size;
+   int incomplete = 0;
+   
+   if (LT32(jitter->current_timestamp+jitter->tick_size, jitter->pointer_timestamp))
+   {
+      jitter->current_timestamp = jitter->pointer_timestamp;
+      speex_warning("did you forget to call jitter_buffer_tick() by any chance?");
+   }
+   /*fprintf (stderr, "get packet %d %d\n", jitter->pointer_timestamp, jitter->current_timestamp);*/
+
+   /* FIXME: This should be only what remains of the current tick */
+   chunk_size = jitter->tick_size;
+   
+   /* Compiling arrival statistics */
    
    late_ratio_short = 0;
    late_ratio_long = 0;
@@ -204,12 +270,15 @@
    }
    if (0&&jitter->pointer_timestamp%1000==0)
    {
-      fprintf (stderr, "%f %f %f %f %f %f\n", early_ratio_short, early_ratio_long, ontime_ratio_short, ontime_ratio_long, late_ratio_short, late_ratio_long);
+      /*fprintf (stderr, "%f %f %f %f %f %f\n", early_ratio_short, early_ratio_long, ontime_ratio_short, ontime_ratio_long, late_ratio_short, late_ratio_long);*/
       /*fprintf (stderr, "%f %f\n", early_ratio_short + ontime_ratio_short + late_ratio_short, early_ratio_long + ontime_ratio_long + late_ratio_long);*/
    }
    
+   /* Adjusting the buffering */
+   
    if (late_ratio_short > .1 || late_ratio_long > .03)
    {
+      /* If too many packets are arriving late */
       jitter->shortterm_margin[MAX_MARGIN-1] += jitter->shortterm_margin[MAX_MARGIN-2];
       jitter->longterm_margin[MAX_MARGIN-1] += jitter->longterm_margin[MAX_MARGIN-2];
       for (i=MAX_MARGIN-3;i>=0;i--)
@@ -219,18 +288,13 @@
       }
       jitter->shortterm_margin[0] = 0;
       jitter->longterm_margin[0] = 0;            
-      /*fprintf (stderr, "interpolate frame\n");*/
-      speex_decode_int(jitter->dec, NULL, (spx_int16_t*)out);
-      if (current_timestamp)
-         *current_timestamp = jitter->pointer_timestamp;
-      return;
-   }
-   
-   /* Increment timestamp */
-   jitter->pointer_timestamp += jitter->frame_time;
-   
-   if (late_ratio_short + ontime_ratio_short < .005 && late_ratio_long + ontime_ratio_long < .01 && early_ratio_short > .8)
+      jitter->pointer_timestamp -= jitter->tick_size;
+      jitter->current_timestamp -= jitter->tick_size;
+      /*fprintf (stderr, "i");*/
+      /*fprintf (stderr, "interpolate (getting some slack)\n");*/
+   } else if (late_ratio_short + ontime_ratio_short < .005 && late_ratio_long + ontime_ratio_long < .01 && early_ratio_short > .8)
    {
+      /* Many frames arriving early */
       jitter->shortterm_margin[0] += jitter->shortterm_margin[1];
       jitter->longterm_margin[0] += jitter->longterm_margin[1];
       for (i=1;i<MAX_MARGIN-1;i++)
@@ -241,61 +305,191 @@
       jitter->shortterm_margin[MAX_MARGIN-1] = 0;
       jitter->longterm_margin[MAX_MARGIN-1] = 0;      
       /*fprintf (stderr, "drop frame\n");*/
-      jitter->pointer_timestamp += jitter->frame_time;
-   }
-
-   if (current_timestamp)
-      *current_timestamp = jitter->pointer_timestamp;
-
-   /* Send zeros while we fill in the buffer */
-   if (jitter->pointer_timestamp<0)
-   {
-      for (i=0;i<jitter->frame_size;i++)
-         out[i]=0;
-      return;
+      /*fprintf (stderr, "d");*/
+      jitter->pointer_timestamp += jitter->tick_size;
+      jitter->current_timestamp += jitter->tick_size;
+      /*fprintf (stderr, "dropping packet (getting more aggressive)\n");*/
    }
    
-   /* Search the buffer for a packet with the right timestamp */
+   /* Searching for the packet that fits best */
+   
+   /* Search the buffer for a packet with the right timestamp and spanning the whole current chunk */
    for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
    {
-      if (jitter->len[i]!=-1 && jitter->timestamp[i]==jitter->pointer_timestamp)
+      if (jitter->buf[i] && jitter->timestamp[i]==jitter->pointer_timestamp && GE32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp+chunk_size))
          break;
    }
    
+   /* If no match, try for an "older" packet that still spans (fully) the current chunk */
    if (i==SPEEX_JITTER_MAX_BUFFER_SIZE)
    {
-      /* No packet found */
-      if (jitter->valid_bits)
+      for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
       {
-         /* Try decoding last received packet */
-         ret = speex_decode_int(jitter->dec, &jitter->current_packet, (spx_int16_t*)out);
-         if (ret == 0)
+         if (jitter->buf[i] && jitter->timestamp[i]<=jitter->pointer_timestamp && GE32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp+chunk_size))
+            break;
+      }
+   }
+   
+   /* If still no match, try for an "older" packet that spans part of the current chunk */
+   if (i==SPEEX_JITTER_MAX_BUFFER_SIZE)
+   {
+      for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
+      {
+         if (jitter->buf[i] && jitter->timestamp[i]<=jitter->pointer_timestamp && GT32(jitter->timestamp[i]+jitter->span[i],jitter->pointer_timestamp))
+            break;
+      }
+   }
+   
+   /* If still no match, try for earliest packet possible */
+   if (i==SPEEX_JITTER_MAX_BUFFER_SIZE)
+   {
+      int found = 0;
+      spx_uint32_t best_time=0;
+      int best_span=0;
+      int besti=0;
+      for (i=0;i<SPEEX_JITTER_MAX_BUFFER_SIZE;i++)
+      {
+         /* check if packet starts within current chunk */
+         if (jitter->buf[i] && LT32(jitter->timestamp[i],jitter->pointer_timestamp+chunk_size) && GE32(jitter->timestamp[i],jitter->pointer_timestamp))
          {
-            jitter->lost_count = 0;
-            return;
-         } else {
-            jitter->valid_bits = 0;
+            if (!found || LT32(jitter->timestamp[i],best_time) || (jitter->timestamp[i]==best_time && GT32(jitter->span[i],best_span)))
+            {
+               best_time = jitter->timestamp[i];
+               best_span = jitter->span[i];
+               besti = i;
+               found = 1;
+            }
          }
       }
-
-      /*fprintf (stderr, "lost/late frame %d\n", jitter->pointer_timestamp);*/
-      /*Packet is late or lost*/
-      speex_decode_int(jitter->dec, NULL, (spx_int16_t*)out);
-      jitter->lost_count++;
-      if (jitter->lost_count>=25)
+      if (found)
       {
-         jitter->lost_count = 0;
-         jitter->reset_state = 1;
-         speex_decoder_ctl(jitter->dec, SPEEX_RESET_STATE, NULL);
+         i=besti;
+         incomplete = 1;
+         /*fprintf (stderr, "incomplete: %d %d %d %d\n", jitter->timestamp[i], jitter->pointer_timestamp, chunk_size, jitter->span[i]);*/
       }
-      jitter->loss_rate = .999*jitter->loss_rate + .001;
-   } else {
+   }
+
+   /* If we find something */
+   if (i!=SPEEX_JITTER_MAX_BUFFER_SIZE)
+   {
+      /* We (obviously) haven't lost this packet */
       jitter->lost_count = 0;
-      /* Found the right packet */
-      speex_bits_read_from(&jitter->current_packet, jitter->buf[i], jitter->len[i]);
-      jitter->len[i]=-1;
+      jitter->loss_rate = .999*jitter->loss_rate;
+      /* Check for potential overflow */
+      packet->len = jitter->len[i];
+      /* Copy packet */
+      for (j=0;j<packet->len;j++)
+         packet->data[j] = jitter->buf[i][j];
+      /* Remove packet */
+      speex_free(jitter->buf[i]);
+      jitter->buf[i] = NULL;
+      /* Set timestamp and span (if requested) */
+      if (start_offset)
+         *start_offset = jitter->timestamp[i]-jitter->pointer_timestamp;
+      packet->timestamp = jitter->timestamp[i];
+      packet->span = jitter->span[i];
+      /* Point at the end of the current packet */
+      jitter->pointer_timestamp = jitter->timestamp[i]+jitter->span[i];
+      if (incomplete)
+         return JITTER_BUFFER_INCOMPLETE;
+      else
+         return JITTER_BUFFER_OK;
+   }
+   
+   
+   /* If we haven't found anything worth returning */
+   /*fprintf (stderr, "not found\n");*/
+   jitter->lost_count++;
+   /*fprintf (stderr, "m");*/
+   /*fprintf (stderr, "lost_count = %d\n", jitter->lost_count);*/
+   jitter->loss_rate = .999*jitter->loss_rate + .001;
+   if (start_offset)
+      *start_offset = 0;
+   packet->timestamp = jitter->pointer_timestamp;
+   packet->span = jitter->tick_size;
+   jitter->pointer_timestamp += chunk_size;
+   packet->len = 0;
+   return JITTER_BUFFER_MISSING;
+
+}
+
+/** Get pointer timestamp of jitter buffer */
+int jitter_buffer_get_pointer_timestamp(JitterBuffer *jitter)
+{
+   return jitter->pointer_timestamp;
+}
+
+void jitter_buffer_tick(JitterBuffer *jitter)
+{
+   jitter->current_timestamp += jitter->tick_size;
+}
+
+
+
+
+
+void speex_jitter_init(SpeexJitter *jitter, void *decoder, int sampling_rate)
+{
+   jitter->dec = decoder;
+   speex_decoder_ctl(decoder, SPEEX_GET_FRAME_SIZE, &jitter->frame_size);
+
+   jitter->packets = jitter_buffer_init(jitter->frame_size);
+
+   speex_bits_init(&jitter->current_packet);
+   jitter->valid_bits = 0;
+
+}
+
+void speex_jitter_destroy(SpeexJitter *jitter)
+{
+   jitter_buffer_destroy(jitter->packets);
+   speex_bits_destroy(&jitter->current_packet);
+}
+
+void speex_jitter_put(SpeexJitter *jitter, char *packet, int len, int timestamp)
+{
+   JitterBufferPacket p;
+   p.data = packet;
+   p.len = len;
+   p.timestamp = timestamp;
+   p.span = jitter->frame_size;
+   jitter_buffer_put(jitter->packets, &p);
+}
+
+void speex_jitter_get(SpeexJitter *jitter, short *out, int *current_timestamp)
+{
+   int i;
+   int ret;
+   char data[2048];
+   JitterBufferPacket packet;
+   packet.data = data;
+   
+   if (jitter->valid_bits)
+   {
+      /* Try decoding last received packet */
+      ret = speex_decode_int(jitter->dec, &jitter->current_packet, out);
+      if (ret == 0)
+      {
+         jitter_buffer_tick(jitter->packets);
+         return;
+      } else {
+         jitter->valid_bits = 0;
+      }
+   }
+
+   ret = jitter_buffer_get(jitter->packets, &packet, NULL);
+   
+   if (ret != JITTER_BUFFER_OK)
+   {
+      /* No packet found */
+
+      /*fprintf (stderr, "lost/late frame\n");*/
+      /*Packet is late or lost*/
+      speex_decode_int(jitter->dec, NULL, out);
+   } else {
+      speex_bits_read_from(&jitter->current_packet, packet.data, packet.len);
       /* Decode packet */
-      ret = speex_decode_int(jitter->dec, &jitter->current_packet, (spx_int16_t*)out);
+      ret = speex_decode_int(jitter->dec, &jitter->current_packet, out);
       if (ret == 0)
       {
          jitter->valid_bits = 1;
@@ -304,13 +498,11 @@
          for (i=0;i<jitter->frame_size;i++)
             out[i]=0;
       }
-      jitter->loss_rate = .999*jitter->loss_rate;
    }
-
-
+   jitter_buffer_tick(jitter->packets);
 }
 
 int speex_jitter_get_pointer_timestamp(SpeexJitter *jitter)
 {
-   return jitter->pointer_timestamp;
+   return jitter_buffer_get_pointer_timestamp(jitter->packets);
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/kiss_fft.c b/pjmedia/src/pjmedia-codec/speex/kiss_fft.c
index bea55ee..a0b3724 100644
--- a/pjmedia/src/pjmedia-codec/speex/kiss_fft.c
+++ b/pjmedia/src/pjmedia-codec/speex/kiss_fft.c
@@ -32,7 +32,7 @@
 #define CHECKBUF(buf,nbuf,n) \
     do { \
         if ( nbuf < (size_t)(n) ) {\
-            free(buf); \
+            speex_free(buf); \
             buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
             nbuf = (size_t)(n); \
         } \
@@ -87,7 +87,7 @@
     if (!st->inverse) {
        int i;
        kiss_fft_cpx *x=Fout;
-       for (i=0;i<(int)(4*m);i++)
+       for (i=0;i<4*m;i++)
        {
           x[i].r = PSHR16(x[i].r,2);
           x[i].i = PSHR16(x[i].i,2);
@@ -404,7 +404,7 @@
     if (fin == fout) {
         CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
         kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
-        memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
+        speex_move(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
     }else{
         kf_work( fout, fin, 1,in_stride, st->factors,st );
     }
@@ -421,10 +421,10 @@
  */ 
 void kiss_fft_cleanup(void)
 {
-    free(scratchbuf);
+    speex_free(scratchbuf);
     scratchbuf = NULL;
     nscratchbuf=0;
-    free(tmpbuf);
+    speex_free(tmpbuf);
     tmpbuf=NULL;
     ntmpbuf=0;
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/kiss_fft.h b/pjmedia/src/pjmedia-codec/speex/kiss_fft.h
index d07b78b..54627e7 100644
--- a/pjmedia/src/pjmedia-codec/speex/kiss_fft.h
+++ b/pjmedia/src/pjmedia-codec/speex/kiss_fft.h
@@ -2,11 +2,8 @@
 #define KISS_FFT_H
 
 #include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
-#include <memory.h>
-//Not available in gcc MacOS X (bennylp)
-//#include <malloc.h>
+#include "misc.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,13 +27,13 @@
 # define kiss_fft_scalar __m128
 #define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
 #else	
-#define KISS_FFT_MALLOC malloc
+#define KISS_FFT_MALLOC speex_alloc
 #endif	
 
 
 #ifdef FIXED_POINT
-#include <sys/types.h>	
-#  define kiss_fft_scalar int16_t
+#include "misc.h"	
+#  define kiss_fft_scalar spx_int16_t
 #else
 # ifndef kiss_fft_scalar
 /*  default is float */
@@ -95,7 +92,7 @@
 
 /* If kiss_fft_alloc allocated a buffer, it is one contiguous 
    buffer and can be simply free()d when no longer needed*/
-#define kiss_fft_free free
+#define kiss_fft_free speex_free
 
 /*
  Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up 
diff --git a/pjmedia/src/pjmedia-codec/speex/kiss_fftr.c b/pjmedia/src/pjmedia-codec/speex/kiss_fftr.c
index 3ac4db9..b90b725 100644
--- a/pjmedia/src/pjmedia-codec/speex/kiss_fftr.c
+++ b/pjmedia/src/pjmedia-codec/speex/kiss_fftr.c
@@ -35,7 +35,7 @@
     size_t subsize, memneeded;
 
     if (nfft & 1) {
-        fprintf(stderr,"Real FFT optimization must be even.\n");
+        speex_warning("Real FFT optimization must be even.\n");
         return NULL;
     }
     nfft >>= 1;
@@ -75,7 +75,7 @@
     kiss_fft_cpx fpnk,fpk,f1k,f2k,tw,tdc;
 
     if ( st->substate->inverse) {
-        fprintf(stderr,"kiss fft usage error: improper alloc\n");
+        speex_warning("kiss fft usage error: improper alloc\n");
         exit(1);
     }
 
@@ -130,7 +130,7 @@
     int k, ncfft;
 
     if (st->substate->inverse == 0) {
-        fprintf (stderr, "kiss fft usage error: improper alloc\n");
+        speex_warning ("kiss fft usage error: improper alloc\n");
         exit (1);
     }
 
diff --git a/pjmedia/src/pjmedia-codec/speex/kiss_fftr.h b/pjmedia/src/pjmedia-codec/speex/kiss_fftr.h
index 72e5a57..2e8351a 100644
--- a/pjmedia/src/pjmedia-codec/speex/kiss_fftr.h
+++ b/pjmedia/src/pjmedia-codec/speex/kiss_fftr.h
@@ -38,7 +38,7 @@
  output timedata has nfft scalar points
 */
 
-#define kiss_fftr_free free
+#define kiss_fftr_free speex_free
 
 #ifdef __cplusplus
 }
diff --git a/pjmedia/src/pjmedia-codec/speex/lpc.c b/pjmedia/src/pjmedia-codec/speex/lpc.c
index c465fae..fd5d382 100644
--- a/pjmedia/src/pjmedia-codec/speex/lpc.c
+++ b/pjmedia/src/pjmedia-codec/speex/lpc.c
@@ -94,7 +94,7 @@
       for (j = 0; j < i; j++) 
          rr = SUB32(rr,MULT16_16(lpc[j],ac[i - j]));
 #ifdef FIXED_POINT
-      r = DIV32_16(rr,ADD16(error,16));
+      r = DIV32_16(rr+PSHR32(error,1),ADD16(error,8));
 #else
       r = rr/(error+.003*ac[0]);
 #endif
@@ -103,11 +103,11 @@
       for (j = 0; j < i>>1; j++) 
       {
          spx_word16_t tmp  = lpc[j];
-         lpc[j]     = MAC16_16_Q13(lpc[j],r,lpc[i-1-j]);
-         lpc[i-1-j] = MAC16_16_Q13(lpc[i-1-j],r,tmp);
+         lpc[j]     = MAC16_16_P13(lpc[j],r,lpc[i-1-j]);
+         lpc[i-1-j] = MAC16_16_P13(lpc[i-1-j],r,tmp);
       }
       if (i & 1) 
-         lpc[j] = MAC16_16_Q13(lpc[j],lpc[j],r);
+         lpc[j] = MAC16_16_P13(lpc[j],lpc[j],r);
 
       error = SUB16(error,MULT16_16_Q13(r,MULT16_16_Q13(error,r)));
    }
diff --git a/pjmedia/src/pjmedia-codec/speex/lsp.c b/pjmedia/src/pjmedia-codec/speex/lsp.c
index f4350ae..6e7ea31 100644
--- a/pjmedia/src/pjmedia-codec/speex/lsp.c
+++ b/pjmedia/src/pjmedia-codec/speex/lsp.c
@@ -1,8 +1,6 @@
 /*---------------------------------------------------------------------------*\
 Original copyright
-	FILE........: AKSLSPD.C
-	TYPE........: Turbo C
-	COMPANY.....: Voicetronix
+	FILE........: lsp.c
 	AUTHOR......: David Rowe
 	DATE CREATED: 24/2/93
 
@@ -44,6 +42,43 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+/*---------------------------------------------------------------------------*\
+
+  Introduction to Line Spectrum Pairs (LSPs)
+  ------------------------------------------
+
+  LSPs are used to encode the LPC filter coefficients {ak} for
+  transmission over the channel.  LSPs have several properties (like
+  less sensitivity to quantisation noise) that make them superior to
+  direct quantisation of {ak}.
+
+  A(z) is a polynomial of order lpcrdr with {ak} as the coefficients.
+
+  A(z) is transformed to P(z) and Q(z) (using a substitution and some
+  algebra), to obtain something like:
+
+    A(z) = 0.5[P(z)(z+z^-1) + Q(z)(z-z^-1)]  (1)
+
+  As you can imagine A(z) has complex zeros all over the z-plane. P(z)
+  and Q(z) have the very neat property of only having zeros _on_ the
+  unit circle.  So to find them we take a test point z=exp(jw) and
+  evaluate P (exp(jw)) and Q(exp(jw)) using a grid of points between 0
+  and pi.
+
+  The zeros (roots) of P(z) also happen to alternate, which is why we
+  swap coefficients as we find roots.  So the process of finding the
+  LSP frequencies is basically finding the roots of 5th order
+  polynomials.
+
+  The roots of P(z) and Q(z) occur in symmetrical pairs at +/-w, hence
+  the name Line Spectrum Pairs (LSPs).
+
+  To convert back to ak we just evaluate (1), "clocking" an impulse
+  thru it lpcrdr times gives us the impulse response of A(z) which is
+  {ak}.
+
+\*---------------------------------------------------------------------------*/
+
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
@@ -63,8 +98,6 @@
 
 #ifdef FIXED_POINT
 
-
-
 #define FREQ_SCALE 16384
 
 /*#define ANGLE2X(a) (32768*cos(((a)/8192.)))*/
@@ -73,6 +106,10 @@
 /*#define X2ANGLE(x) (acos(.00006103515625*(x))*LSP_SCALING)*/
 #define X2ANGLE(x) (spx_acos(x))
 
+#ifdef BFIN_ASM
+#include "lsp_bfin.h"
+#endif
+
 #else
 
 /*#define C1 0.99940307
@@ -88,27 +125,28 @@
 
 /*---------------------------------------------------------------------------*\
 
-	FUNCTION....: cheb_poly_eva()
+   FUNCTION....: cheb_poly_eva()
 
-	AUTHOR......: David Rowe
-	DATE CREATED: 24/2/93
+   AUTHOR......: David Rowe
+   DATE CREATED: 24/2/93
 
-    This function evaluates a series of Chebyshev polynomials
+   This function evaluates a series of Chebyshev polynomials
 
 \*---------------------------------------------------------------------------*/
 
 #ifdef FIXED_POINT
 
-static inline spx_word32_t cheb_poly_eva(spx_word32_t *coef,spx_word16_t x,int m,char *stack)
-/*  float coef[]  	coefficients of the polynomial to be evaluated 	*/
-/*  float x   		the point where polynomial is to be evaluated 	*/
-/*  int m 		order of the polynomial 			*/
+#ifndef OVERRIDE_CHEB_POLY_EVA
+static inline spx_word32_t cheb_poly_eva(
+  spx_word16_t *coef, /* P or Q coefs in Q13 format               */
+  spx_word16_t     x, /* cos of freq (-1.0 to 1.0) in Q14 format  */
+  int              m, /* LPC order/2                              */
+  char         *stack
+)
 {
     int i;
-    VARDECL(spx_word16_t *T);
+    spx_word16_t b0, b1;
     spx_word32_t sum;
-    int m2=m>>1;
-    VARDECL(spx_word16_t *coefn);
 
     /*Prevents overflows*/
     if (x>16383)
@@ -116,73 +154,55 @@
     if (x<-16383)
        x = -16383;
 
-    /* Allocate memory for Chebyshev series formulation */
-    ALLOC(T, m2+1, spx_word16_t);
-    ALLOC(coefn, m2+1, spx_word16_t);
-
-    for (i=0;i<m2+1;i++)
-    {
-       coefn[i] = coef[i];
-       /*printf ("%f ", coef[i]);*/
-    }
-    /*printf ("\n");*/
-
     /* Initialise values */
-    T[0]=16384;
-    T[1]=x;
+    b1=16384;
+    b0=x;
 
-    /* Evaluate Chebyshev series formulation using iterative approach  */
-    /* Evaluate polynomial and return value also free memory space */
-    sum = ADD32(EXTEND32(coefn[m2]), EXTEND32(MULT16_16_P14(coefn[m2-1],x)));
-    /*x *= 2;*/
-    for(i=2;i<=m2;i++)
+    /* Evaluate Chebyshev series formulation using iterative approach  */
+    sum = ADD32(EXTEND32(coef[m]), EXTEND32(MULT16_16_P14(coef[m-1],x)));
+    for(i=2;i<=m;i++)
     {
-       T[i] = SUB16(MULT16_16_Q13(x,T[i-1]), T[i-2]);
-       sum = ADD32(sum, EXTEND32(MULT16_16_P14(coefn[m2-i],T[i])));
-       /*printf ("%f ", sum);*/
-    }
-    
-    /*printf ("\n");*/
-    return sum;
-}
-#else
-static float cheb_poly_eva(spx_word32_t *coef,float x,int m,char *stack)
-/*  float coef[]  	coefficients of the polynomial to be evaluated 	*/
-/*  float x   		the point where polynomial is to be evaluated 	*/
-/*  int m 		order of the polynomial 			*/
-{
-    int i;
-    VARDECL(float *T);
-    float sum;
-    int m2=m>>1;
-
-    /* Allocate memory for Chebyshev series formulation */
-    ALLOC(T, m2+1, float);
-
-    /* Initialise values */
-    T[0]=1;
-    T[1]=x;
-
-    /* Evaluate Chebyshev series formulation using iterative approach  */
-    /* Evaluate polynomial and return value also free memory space */
-    sum = coef[m2] + coef[m2-1]*x;
-    x *= 2;
-    for(i=2;i<=m2;i++)
-    {
-       T[i] = x*T[i-1] - T[i-2];
-       sum += coef[m2-i] * T[i];
+       spx_word16_t tmp=b0;
+       b0 = SUB16(MULT16_16_Q13(x,b0), b1);
+       b1 = tmp;
+       sum = ADD32(sum, EXTEND32(MULT16_16_P14(coef[m-i],b0)));
     }
     
     return sum;
 }
 #endif
 
+#else
+
+static float cheb_poly_eva(spx_word32_t *coef, spx_word16_t x, int m, char *stack)
+{
+   int k;
+   float b0, b1, tmp;
+   /* Non-FIXED_POINT build: returns sum over k=0..m of coef[m-k]*T_k(x), using the Clenshaw recurrence; stack is not used */
+   /* Initial conditions */
+   b0=0; /* b_(m+1) */
+   b1=0; /* b_(m+2) */
+   /* the recurrence needs 2x, so x is doubled once here (and halved again in the return) */
+   x*=2;
+
+   /* Calculate the b_(k) */
+   for(k=m;k>0;k--)
+   {
+      tmp=b0;                           /* tmp holds the previous value of b0 */
+      b0=x*b0-b1+coef[m-k];    /* b0 holds its new value based on b0 and b1 */
+      b1=tmp;                           /* b1 holds the previous value of b0 */
+   }
+   /* here b0 is b_1 and b1 is b_2; .5*x is the original x */
+   return(-b1+.5*x*b0+coef[m]);
+}
+#endif
+
 /*---------------------------------------------------------------------------*\
 
-	FUNCTION....: lpc_to_lsp()
+    FUNCTION....: lpc_to_lsp()
 
-	AUTHOR......: David Rowe
-	DATE CREATED: 24/2/93
+    AUTHOR......: David Rowe
+    DATE CREATED: 24/2/93
 
     This function converts LPC coefficients to LSP
     coefficients.
@@ -210,11 +230,13 @@
     int i,j,m,flag,k;
     VARDECL(spx_word32_t *Q);                 	/* ptrs for memory allocation 		*/
     VARDECL(spx_word32_t *P);
+    VARDECL(spx_word16_t *Q16);         /* ptrs for memory allocation 		*/
+    VARDECL(spx_word16_t *P16);
     spx_word32_t *px;                	/* ptrs of respective P'(z) & Q'(z)	*/
     spx_word32_t *qx;
     spx_word32_t *p;
     spx_word32_t *q;
-    spx_word32_t *pt;                	/* ptr used for cheb_poly_eval()
+    spx_word16_t *pt;                	/* ptr used for cheb_poly_eval()
 				whether P' or Q' 			*/
     int roots=0;              	/* DR 8/2/94: number of roots found 	*/
     flag = 1;                	/*  program is searching for a root when,
@@ -276,20 +298,31 @@
     px = P;             	/* re-initialise ptrs 			*/
     qx = Q;
 
+    /* now that we have computed P and Q convert to 16 bits to
+       speed up cheb_poly_eval */
+
+    ALLOC(P16, m+1, spx_word16_t);
+    ALLOC(Q16, m+1, spx_word16_t);
+
+    for (i=0;i<m+1;i++)
+    {
+       P16[i] = P[i];
+       Q16[i] = Q[i];
+    }
+
     /* Search for a zero in P'(z) polynomial first and then alternate to Q'(z).
     Keep alternating between the two polynomials as each zero is found 	*/
 
     xr = 0;             	/* initialise xr to zero 		*/
     xl = FREQ_SCALE;               	/* start at point xl = 1 		*/
 
-
     for(j=0;j<lpcrdr;j++){
 	if(j&1)            	/* determines whether P' or Q' is eval. */
-	    pt = qx;
+	    pt = Q16;
 	else
-	    pt = px;
+	    pt = P16;
 
-	psuml = cheb_poly_eva(pt,xl,lpcrdr,stack);	/* evals poly. at xl 	*/
+	psuml = cheb_poly_eva(pt,xl,m,stack);	/* evals poly. at xl 	*/
 	flag = 1;
 	while(flag && (xr >= -FREQ_SCALE)){
            spx_word16_t dd;
@@ -304,7 +337,7 @@
               dd *= .5;
 #endif
            xr = SUB16(xl, dd);                        	/* interval spacing 	*/
-	    psumr = cheb_poly_eva(pt,xr,lpcrdr,stack);/* poly(xl-delta_x) 	*/
+	    psumr = cheb_poly_eva(pt,xr,m,stack);/* poly(xl-delta_x) 	*/
 	    temp_psumr = psumr;
 	    temp_xr = xr;
 
@@ -328,7 +361,7 @@
 #else
                     xm = .5*(xl+xr);        	/* bisect the interval 	*/
 #endif
-		    psumm=cheb_poly_eva(pt,xm,lpcrdr,stack);
+		    psumm=cheb_poly_eva(pt,xm,m,stack);
 		    /*if(psumm*psuml>0.)*/
 		    if(!SIGN_CHANGE(psumm,psuml))
                     {
@@ -354,7 +387,6 @@
     return(roots);
 }
 
-
 /*---------------------------------------------------------------------------*\
 
 	FUNCTION....: lsp_to_lpc()
@@ -362,8 +394,7 @@
 	AUTHOR......: David Rowe
 	DATE CREATED: 24/2/93
 
-    lsp_to_lpc: This function converts LSP coefficients to LPC
-    coefficients.
+        Converts LSP coefficients to LPC coefficients.
 
 \*---------------------------------------------------------------------------*/
 
@@ -373,77 +404,119 @@
 /*  float *freq 	array of LSP frequencies in the x domain	*/
 /*  float *ak 		array of LPC coefficients 			*/
 /*  int lpcrdr  	order of LPC coefficients 			*/
-
-
 {
     int i,j;
-    spx_word32_t xout1,xout2,xin1,xin2;
-    VARDECL(spx_word32_t *Wp);
-    spx_word32_t *pw,*n1,*n2,*n3,*n4=NULL;
+    spx_word32_t xout1,xout2,xin;
+    spx_word32_t mult, a;
     VARDECL(spx_word16_t *freqn);
+    VARDECL(spx_word32_t **xp);
+    VARDECL(spx_word32_t *xpmem);
+    VARDECL(spx_word32_t **xq);
+    VARDECL(spx_word32_t *xqmem);
     int m = lpcrdr>>1;
+
+    /* 
     
+       Reconstruct P(z) and Q(z) by cascading second order polynomials
+       in form 1 - 2cos(w)z(-1) + z(-2), where w is the LSP frequency.
+       In the time domain this is:
+
+       y(n) = x(n) - 2cos(w)x(n-1) + x(n-2)
+    
+       This is what the ALLOCS below are trying to do:
+
+         int xp[m+1][lpcrdr+1+2]; // P matrix in QIMP
+         int xq[m+1][lpcrdr+1+2]; // Q matrix in QIMP
+
+       These matrices store the output of each stage on each row.  The
+       final (m-th) row has the output of the final (m-th) cascaded
+       2nd order filter.  The first row is the impulse input to the
+       system (not written as it is known).
+
+       The version below takes advantage of the fact that a lot of the
+       outputs are zero or known, for example if we put an impulse
+       into the first section then "clock" it 10 times only the first 3
+       output samples are non-zero (it's an FIR filter).
+    */
+
+    ALLOC(xp, (m+1), spx_word32_t*);
+    ALLOC(xpmem, (m+1)*(lpcrdr+1+2), spx_word32_t);
+
+    ALLOC(xq, (m+1), spx_word32_t*);
+    ALLOC(xqmem, (m+1)*(lpcrdr+1+2), spx_word32_t);
+    
+    for(i=0; i<=m; i++) {
+      xp[i] = xpmem + i*(lpcrdr+1+2);
+      xq[i] = xqmem + i*(lpcrdr+1+2);
+    }
+
+    /* work out 2cos terms in Q14 */
+
     ALLOC(freqn, lpcrdr, spx_word16_t);
-    for (i=0;i<lpcrdr;i++)
+    for (i=0;i<lpcrdr;i++) 
        freqn[i] = ANGLE2X(freq[i]);
 
-    ALLOC(Wp, 4*m+2, spx_word32_t);
-    pw = Wp;
+    #define QIMP  21   /* scaling for impulse */
 
-
-    /* initialise contents of array */
-
-    for(i=0;i<=4*m+1;i++){       	/* set contents of buffer to 0 */
-	*pw++ = 0;
+    xin = SHL32(EXTEND32(1), (QIMP-1)); /* 0.5 in QIMP format */
+   
+    /* first col and last non-zero values of each row are trivial */
+    
+    for(i=0;i<=m;i++) {
+     xp[i][1] = 0;
+     xp[i][2] = xin;
+     xp[i][2+2*i] = xin;
+     xq[i][1] = 0;
+     xq[i][2] = xin;
+     xq[i][2+2*i] = xin;
     }
 
-    /* Set pointers up */
+    /* 2nd row (first output row) is trivial */
 
-    pw = Wp;
-    xin1 = 1048576;
-    xin2 = 1048576;
+    xp[1][3] = -MULT16_32_Q14(freqn[0],xp[0][2]);
+    xq[1][3] = -MULT16_32_Q14(freqn[1],xq[0][2]);
 
-    /* reconstruct P(z) and Q(z) by  cascading second order
-      polynomials in form 1 - 2xz(-1) +z(-2), where x is the
-      LSP coefficient */
+    xout1 = xout2 = 0;
 
-    for(j=0;j<=lpcrdr;j++){
-       spx_word16_t *fr=freqn;
-	for(i=0;i<m;i++){
-	    n1 = pw+(i<<2);
-	    n2 = n1 + 1;
-	    n3 = n2 + 1;
-	    n4 = n3 + 1;
-	    xout1 = ADD32(SUB32(xin1, MULT16_32_Q14(*fr,*n1)), *n2);
-            fr++;
-            xout2 = ADD32(SUB32(xin2, MULT16_32_Q14(*fr,*n3)), *n4);
-            fr++;
-	    *n2 = *n1;
-	    *n4 = *n3;
-	    *n1 = xin1;
-	    *n3 = xin2;
-	    xin1 = xout1;
-	    xin2 = xout2;
-	}
-	xout1 = xin1 + *(n4+1);
-	xout2 = xin2 - *(n4+2);
-        /* FIXME: perhaps apply bandwidth expansion in case of overflow? */
-	if (j>0)
-	{
-        if (xout1 + xout2>SHL32(EXTEND32(32766),8))
-           ak[j-1] = 32767;
-        else if (xout1 + xout2 < -SHL32(EXTEND32(32766),8))
-           ak[j-1] = -32767;
-        else
-           ak[j-1] = EXTRACT16(PSHR32(ADD32(xout1,xout2),8));
-	} else {/*speex_warning_int("ak[0] = ", EXTRACT16(PSHR32(ADD32(xout1,xout2),8)));*/}
-	*(n4+1) = xin1;
-	*(n4+2) = xin2;
+    /* now generate remaining rows */
 
-	xin1 = 0;
-	xin2 = 0;
+    for(i=1;i<m;i++) {
+
+      for(j=1;j<2*(i+1)-1;j++) {
+	mult = MULT16_32_Q14(freqn[2*i],xp[i][j+1]);
+	xp[i+1][j+2] = ADD32(SUB32(xp[i][j+2], mult), xp[i][j]);
+	mult = MULT16_32_Q14(freqn[2*i+1],xq[i][j+1]);
+	xq[i+1][j+2] = ADD32(SUB32(xq[i][j+2], mult), xq[i][j]);
+      }
+
+      /* for last col xp[i][j+2] = xq[i][j+2] = 0 */
+
+      mult = MULT16_32_Q14(freqn[2*i],xp[i][j+1]);
+      xp[i+1][j+2] = SUB32(xp[i][j], mult);
+      mult = MULT16_32_Q14(freqn[2*i+1],xq[i][j+1]);
+      xq[i+1][j+2] = SUB32(xq[i][j], mult);
     }
+
+    /* process last row to extract a{k} */
+
+    for(j=1;j<=lpcrdr;j++) {
+      int shift = QIMP-13;
+      /* shift drops QIMP-13 bits from the QIMP-scaled sum before it is stored in ak */
+      /* final filter sections: add the previous P output, subtract the previous Q output */
+      a = PSHR32(xp[m][j+2] + xout1 + xq[m][j+2] - xout2, shift); 
+      xout1 = xp[m][j+2];
+      xout2 = xq[m][j+2];
+      
+      /* hard limit ak's to +/- 32767 */
+      /* each bound saturates to its own sign: negative overflow must stay negative */
+      if (a < -32767) a = -32767;
+      if (a > 32767) a = 32767;
+      ak[j-1] = (short)a;
+     
+    }
+
 }
+
 #else
 
 void lsp_to_lpc(spx_lsp_t *freq,spx_coef_t *ak,int lpcrdr, char *stack)
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp.c b/pjmedia/src/pjmedia-codec/speex/ltp.c
index 94189c3..9a5a295 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp.c
+++ b/pjmedia/src/pjmedia-codec/speex/ltp.c
@@ -55,7 +55,7 @@
 #endif
 
 #ifndef OVERRIDE_INNER_PROD
-static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum=0;
    len >>= 2;
@@ -75,7 +75,7 @@
 
 #ifndef OVERRIDE_PITCH_XCORR
 #if 0 /* HINT: Enable this for machines with enough registers (i.e. not x86) */
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i,j;
    for (i=0;i<nb_pitch;i+=4)
@@ -138,7 +138,7 @@
 
 }
 #else
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i;
    for (i=0;i<nb_pitch;i++)
@@ -152,128 +152,122 @@
 #endif
 
 #ifndef OVERRIDE_COMPUTE_PITCH_ERROR
-static inline spx_word32_t compute_pitch_error(spx_word32_t *C, spx_word16_t *g, spx_word16_t pitch_control)
+static inline spx_word32_t compute_pitch_error(spx_word16_t *C, spx_word16_t *g, spx_word16_t pitch_control)
 {
    spx_word32_t sum = 0;
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[0],pitch_control),C[0]));
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[1],pitch_control),C[1]));
-   sum = ADD32(sum,MULT16_32_Q15(MULT16_16_16(g[2],pitch_control),C[2]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[0],g[1]),C[3]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[1]),C[4]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[0]),C[5]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[0],g[0]),C[6]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[1],g[1]),C[7]));
-   sum = SUB32(sum,MULT16_32_Q15(MULT16_16_16(g[2],g[2]),C[8]));
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[0],pitch_control),C[0])); /* added: C[0..2], each paired with g[k] and pitch_control */
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[1],pitch_control),C[1]));
+   sum = ADD32(sum,MULT16_16(MULT16_16_16(g[2],pitch_control),C[2]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[0],g[1]),C[3])); /* subtracted: C[3..5], paired with the cross terms (g0,g1), (g2,g1), (g2,g0) */
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[1]),C[4]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[0]),C[5]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[0],g[0]),C[6])); /* subtracted: C[6..8], paired with the squared terms (g0,g0), (g1,g1), (g2,g2) */
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[1],g[1]),C[7]));
+   sum = SUB32(sum,MULT16_16(MULT16_16_16(g[2],g[2]),C[8]));
    return sum;
 }
 #endif
 
-void open_loop_nbest_pitch(spx_sig_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack)
+#ifndef OVERRIDE_OPEN_LOOP_NBEST_PITCH
+void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack)
 {
    int i,j,k;
    VARDECL(spx_word32_t *best_score);
+   VARDECL(spx_word32_t *best_ener);
    spx_word32_t e0;
    VARDECL(spx_word32_t *corr);
    VARDECL(spx_word32_t *energy);
-   VARDECL(spx_word32_t *score);
-   VARDECL(spx_word16_t *swn2);
-   spx_word16_t *swn;
 
    ALLOC(best_score, N, spx_word32_t);
+   ALLOC(best_ener, N, spx_word32_t);
    ALLOC(corr, end-start+1, spx_word32_t);
    ALLOC(energy, end-start+2, spx_word32_t);
-   ALLOC(score, end-start+1, spx_word32_t);
-
-#ifdef FIXED_POINT
-   ALLOC(swn2, end+len, spx_word16_t);
-   normalize16(sw-end, swn2, 16384, end+len);
-   swn = swn2 + end;
-#else
-   swn = sw;
-#endif
 
    for (i=0;i<N;i++)
    {
         best_score[i]=-1;
+        best_ener[i]=0;
         pitch[i]=start;
    }
 
-
-   energy[0]=inner_prod(swn-start, swn-start, len);
-   e0=inner_prod(swn, swn, len);
-   for (i=start;i<=end;i++)
+   energy[0]=inner_prod(sw-start, sw-start, len);
+   e0=inner_prod(sw, sw, len);
+   for (i=start;i<end;i++)
    {
       /* Update energy for next pitch*/
-      energy[i-start+1] = SUB32(ADD32(energy[i-start],SHR32(MULT16_16(swn[-i-1],swn[-i-1]),6)), SHR32(MULT16_16(swn[-i+len-1],swn[-i+len-1]),6));
+      energy[i-start+1] = SUB32(ADD32(energy[i-start],SHR32(MULT16_16(sw[-i-1],sw[-i-1]),6)), SHR32(MULT16_16(sw[-i+len-1],sw[-i+len-1]),6));
       if (energy[i-start+1] < 0)
          energy[i-start+1] = 0;
    }
 
-   pitch_xcorr(swn, swn-end, corr, len, end-start+1, stack);
+   pitch_xcorr(sw, sw-end, corr, len, end-start+1, stack);
 
+   /* FIXME: Fixed-point and floating-point code should be merged */
 #ifdef FIXED_POINT
    {
       VARDECL(spx_word16_t *corr16);
       VARDECL(spx_word16_t *ener16);
       ALLOC(corr16, end-start+1, spx_word16_t);
       ALLOC(ener16, end-start+1, spx_word16_t);
-      normalize16(corr, corr16, 16384, end-start+1);
-      normalize16(energy, ener16, 16384, end-start+1);
+      /* Normalize to 180 so we can square it and it still fits in 16 bits */
+      normalize16(corr, corr16, 180, end-start+1);
+      normalize16(energy, ener16, 180, end-start+1);
 
       for (i=start;i<=end;i++)
       {
-         spx_word16_t g;
-         spx_word32_t tmp;
-         tmp = corr16[i-start];
-         if (tmp>0)
+         spx_word16_t tmp = MULT16_16_16(corr16[i-start],corr16[i-start]);
+         /* Instead of dividing the tmp by the energy, we multiply on the other side */
+         if (MULT16_16(tmp,best_ener[N-1])>MULT16_16(best_score[N-1],ADD16(1,ener16[i-start])))
          {
-            if (SHR16(corr16[i-start],4)>ener16[i-start])
-               tmp = SHL32(EXTEND32(ener16[i-start]),14);
-            else if (-SHR16(corr16[i-start],4)>ener16[i-start])
-               tmp = -SHL32(EXTEND32(ener16[i-start]),14);
-            else
-               tmp = SHL32(tmp,10);
-            g = DIV32_16(tmp, 8+ener16[i-start]);
-            score[i-start] = MULT16_16(corr16[i-start],g);
-         } else
-         {
-            score[i-start] = 1;
+            /* We can safely put it last and then check */
+            best_score[N-1]=tmp;
+            best_ener[N-1]=ener16[i-start]+1;
+            pitch[N-1]=i;
+            /* Check if it comes in front of others */
+            for (j=0;j<N-1;j++)
+            {
+               if (MULT16_16(tmp,best_ener[j])>MULT16_16(best_score[j],ADD16(1,ener16[i-start])))
+               {
+                  for (k=N-1;k>j;k--)
+                  {
+                     best_score[k]=best_score[k-1];
+                     best_ener[k]=best_ener[k-1];
+                     pitch[k]=pitch[k-1];
+                  }
+                  best_score[j]=tmp;
+                  best_ener[j]=ener16[i-start]+1;
+                  pitch[j]=i;
+                  break;
+               }
+            }
          }
       }
    }
 #else
    for (i=start;i<=end;i++)
    {
-      float g = corr[i-start]/(1+energy[i-start]);
-      if (g>16)
-         g = 16;
-      else if (g<-16)
-         g = -16;
-      score[i-start] = g*corr[i-start];
-   }
-#endif
-
-   /* Extract best scores */
-   for (i=start;i<=end;i++)
-   {
-      if (score[i-start]>best_score[N-1])
+      float tmp = corr[i-start]*corr[i-start];
+      if (tmp*best_ener[N-1]>best_score[N-1]*(1+energy[i-start]))
       {
          for (j=0;j<N;j++)
          {
-            if (score[i-start] > best_score[j])
+            if (tmp*best_ener[j]>best_score[j]*(1+energy[i-start]))
             {
                for (k=N-1;k>j;k--)
                {
                   best_score[k]=best_score[k-1];
+                  best_ener[k]=best_ener[k-1];
                   pitch[k]=pitch[k-1];
                }
-               best_score[j]=score[i-start];
+               best_score[j]=tmp;
+               best_ener[j]=energy[i-start]+1;
                pitch[j]=i;
                break;
             }
          }
       }
    }
+#endif
 
    /* Compute open-loop gain */
    if (gain)
@@ -290,164 +284,131 @@
        }
    }
 }
+#endif
 
+#ifndef OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+static int pitch_gain_search_3tap_vq(
+  const signed char *gain_cdbk,      /* codebook, read 4 signed bytes per entry */
+  int                gain_cdbk_size, /* number of entries to scan */
+  spx_word16_t      *C16,            /* passed unchanged to compute_pitch_error() */
+  spx_word16_t       max_gain        /* upper bound on an entry's 4th byte */
+)
+{
+  const signed char *ptr=gain_cdbk;
+  int                best_cdbk=0;
+  spx_word32_t       best_sum=-VERY_LARGE32;
+  spx_word32_t       sum=0;
+  spx_word16_t       g[3];
+  spx_word16_t       pitch_control=64;
+  spx_word16_t       gain_sum;
+  int                i;
+  /* Returns the index of the entry with the largest compute_pitch_error() score among entries allowed by max_gain */
+  for (i=0;i<gain_cdbk_size;i++) {
+    /* entry i: bytes 0..2 become the three gains (32 is added to each), byte 3 is gain_sum */
+    ptr = gain_cdbk+4*i;
+    g[0]=ADD16((spx_word16_t)ptr[0],32);
+    g[1]=ADD16((spx_word16_t)ptr[1],32);
+    g[2]=ADD16((spx_word16_t)ptr[2],32);
+    gain_sum = (spx_word16_t)ptr[3];
+    /* score this gain triple; pitch_control stays at 64 throughout this function */
+    sum = compute_pitch_error(C16, g, pitch_control);
+    /* strict '>' keeps the earliest entry on ties; entries with gain_sum > max_gain are never selected */
+    if (sum>best_sum && gain_sum<=max_gain) {
+      best_sum=sum;
+      best_cdbk=i;
+    }
+  }
+  /* still 0 if no entry passed the test above */
+  return best_cdbk;
+}
+#endif
 
 /** Finds the best quantized 3-tap pitch predictor by analysis by synthesis */
-static spx_word64_t pitch_gain_search_3tap(
-const spx_sig_t target[],       /* Target vector */
+static spx_word32_t pitch_gain_search_3tap(
+const spx_word16_t target[],       /* Target vector */
 const spx_coef_t ak[],          /* LPCs for this subframe */
 const spx_coef_t awk1[],        /* Weighted LPCs #1 for this subframe */
 const spx_coef_t awk2[],        /* Weighted LPCs #2 for this subframe */
 spx_sig_t exc[],                /* Excitation */
-const void *par,
+const signed char *gain_cdbk,
+int gain_cdbk_size,
 int   pitch,                    /* Pitch value */
 int   p,                        /* Number of LPC coeffs */
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-const spx_sig_t *exc2,
+const spx_word16_t *exc2,
 const spx_word16_t *r,
-spx_sig_t *new_target,
+spx_word16_t *new_target,
 int  *cdbk_index,
-int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t cumul_gain
 )
 {
    int i,j;
-   VARDECL(spx_sig_t *tmp1);
-   VARDECL(spx_sig_t *tmp2);
-   spx_sig_t *x[3];
-   spx_sig_t *e[3];
+   VARDECL(spx_word16_t *tmp1);
+   VARDECL(spx_word16_t *e);
+   spx_word16_t *x[3];
    spx_word32_t corr[3];
    spx_word32_t A[3][3];
-   int   gain_cdbk_size;
-   const signed char *gain_cdbk;
    spx_word16_t gain[3];
-   spx_word64_t err;
+   spx_word32_t err;
+   spx_word16_t max_gain=128;
+   int          best_cdbk=0;
 
-   const ltp_params *params;
-   params = (const ltp_params*) par;
-   gain_cdbk_size = 1<<params->gain_bits;
-   gain_cdbk = params->gain_cdbk + 3*gain_cdbk_size*cdbk_offset;
-   ALLOC(tmp1, 3*nsf, spx_sig_t);
-   ALLOC(tmp2, 3*nsf, spx_sig_t);
+   ALLOC(tmp1, 3*nsf, spx_word16_t);
+   ALLOC(e, nsf, spx_word16_t);
 
+   if (cumul_gain > 262144)
+      max_gain = 31;
+   
    x[0]=tmp1;
    x[1]=tmp1+nsf;
    x[2]=tmp1+2*nsf;
    
-   e[0]=tmp2;
-   e[1]=tmp2+nsf;
-   e[2]=tmp2+2*nsf;
-   for (i=2;i>=0;i--)
    {
-      int pp=pitch+1-i;
+      VARDECL(spx_mem_t *mm);
+      int pp=pitch-1;
+      ALLOC(mm, p, spx_mem_t);
       for (j=0;j<nsf;j++)
       {
          if (j-pp<0)
-            e[i][j]=exc2[j-pp];
+            e[j]=exc2[j-pp];
          else if (j-pp-pitch<0)
-            e[i][j]=exc2[j-pp-pitch];
+            e[j]=exc2[j-pp-pitch];
          else
-            e[i][j]=0;
+            e[j]=0;
       }
-
-      if (i==2)
-         syn_percep_zero(e[i], ak, awk1, awk2, x[i], nsf, p, stack);
-      else {
-         for (j=0;j<nsf-1;j++)
-            x[i][j+1]=x[i+1][j];
-         x[i][0]=0;
-         for (j=0;j<nsf;j++)
-         {
-            x[i][j]=ADD32(x[i][j],SHL32(MULT16_32_Q15(r[j], e[i][0]),1));
-         }
-      }
+      for (j=0;j<p;j++)
+         mm[j] = 0;
+      iir_mem16(e, ak, e, nsf, p, mm, stack);
+      for (j=0;j<p;j++)
+         mm[j] = 0;
+      filter_mem16(e, awk1, awk2, e, nsf, p, mm, stack);
+      for (j=0;j<nsf;j++)
+         x[2][j] = e[j];
    }
-
-#ifdef FIXED_POINT
+   for (i=1;i>=0;i--)
    {
-      /* If using fixed-point, we need to normalize the signals first */
-      spx_word16_t *y[3];
-      VARDECL(spx_word16_t *ytmp);
-      VARDECL(spx_word16_t *t);
-
-      spx_sig_t max_val=1;
-      int sig_shift;
-      
-      ALLOC(ytmp, 3*nsf, spx_word16_t);
-#if 0
-      ALLOC(y[0], nsf, spx_word16_t);
-      ALLOC(y[1], nsf, spx_word16_t);
-      ALLOC(y[2], nsf, spx_word16_t);
-#else
-      y[0] = ytmp;
-      y[1] = ytmp+nsf;
-      y[2] = ytmp+2*nsf;
-#endif
-      ALLOC(t, nsf, spx_word16_t);
-      for (j=0;j<3;j++)
-      {
-         for (i=0;i<nsf;i++)
-         {
-            spx_sig_t tmp = x[j][i];
-            if (tmp<0)
-               tmp = -tmp;
-            if (tmp > max_val)
-               max_val = tmp;
-         }
-      }
-      for (i=0;i<nsf;i++)
-      {
-         spx_sig_t tmp = target[i];
-         if (tmp<0)
-            tmp = -tmp;
-         if (tmp > max_val)
-            max_val = tmp;
-      }
-
-      sig_shift=0;
-      while (max_val>16384)
-      {
-         sig_shift++;
-         max_val >>= 1;
-      }
-
-      for (j=0;j<3;j++)
-      {
-         for (i=0;i<nsf;i++)
-         {
-            y[j][i] = EXTRACT16(SHR32(x[j][i],sig_shift));
-         }
-      }
-      for (i=0;i<nsf;i++)
-      {
-         t[i] = EXTRACT16(SHR32(target[i],sig_shift));
-      }
-
-      for (i=0;i<3;i++)
-         corr[i]=inner_prod(y[i],t,nsf);
-      
-      for (i=0;i<3;i++)
-         for (j=0;j<=i;j++)
-            A[i][j]=A[j][i]=inner_prod(y[i],y[j],nsf);
+      spx_word16_t e0=exc2[-pitch-1+i];
+      x[i][0]=MULT16_16_Q14(r[0], e0);
+      for (j=0;j<nsf-1;j++)
+         x[i][j+1]=ADD32(x[i+1][j],MULT16_16_P14(r[j+1], e0));
    }
-#else
-   {
-      for (i=0;i<3;i++)
-         corr[i]=inner_prod(x[i],target,nsf);
-      
-      for (i=0;i<3;i++)
-         for (j=0;j<=i;j++)
-            A[i][j]=A[j][i]=inner_prod(x[i],x[j],nsf);
-   }
-#endif
+
+   for (i=0;i<3;i++)
+      corr[i]=inner_prod(x[i],target,nsf);
+   for (i=0;i<3;i++)
+      for (j=0;j<=i;j++)
+         A[i][j]=A[j][i]=inner_prod(x[i],x[j],nsf);
 
    {
       spx_word32_t C[9];
-      const signed char *ptr=gain_cdbk;
-      int best_cdbk=0;
-      spx_word32_t best_sum=0;
+#ifdef FIXED_POINT
+      spx_word16_t C16[9];
+#else
+      spx_word16_t *C16=C;
+#endif      
       C[0]=corr[2];
       C[1]=corr[1];
       C[2]=corr[0];
@@ -461,111 +422,73 @@
       /*plc_tuning *= 2;*/
       if (plc_tuning<2)
          plc_tuning=2;
+      if (plc_tuning>30)
+         plc_tuning=30;
 #ifdef FIXED_POINT
-      C[0] = MAC16_32_Q15(C[0],MULT16_16_16(plc_tuning,-327),C[0]);
-      C[1] = MAC16_32_Q15(C[1],MULT16_16_16(plc_tuning,-327),C[1]);
-      C[2] = MAC16_32_Q15(C[2],MULT16_16_16(plc_tuning,-327),C[2]);
       C[0] = SHL32(C[0],1);
       C[1] = SHL32(C[1],1);
       C[2] = SHL32(C[2],1);
       C[3] = SHL32(C[3],1);
       C[4] = SHL32(C[4],1);
       C[5] = SHL32(C[5],1);
+      C[6] = MAC16_32_Q15(C[6],MULT16_16_16(plc_tuning,655),C[6]);
+      C[7] = MAC16_32_Q15(C[7],MULT16_16_16(plc_tuning,655),C[7]);
+      C[8] = MAC16_32_Q15(C[8],MULT16_16_16(plc_tuning,655),C[8]);
+      normalize16(C, C16, 32767, 9);
 #else
-      C[0]*=1-.01*plc_tuning;
-      C[1]*=1-.01*plc_tuning;
-      C[2]*=1-.01*plc_tuning;
-      C[6]*=.5*(1+.01*plc_tuning);
-      C[7]*=.5*(1+.01*plc_tuning);
-      C[8]*=.5*(1+.01*plc_tuning);
+      C[6]*=.5*(1+.02*plc_tuning);
+      C[7]*=.5*(1+.02*plc_tuning);
+      C[8]*=.5*(1+.02*plc_tuning);
 #endif
-      for (i=0;i<gain_cdbk_size;i++)
-      {
-         spx_word32_t sum=0;
-         spx_word16_t g[3];
-         spx_word16_t pitch_control=64;
-         spx_word16_t gain_sum;
-         
-         ptr = gain_cdbk+3*i;
-         g[0]=ADD16((spx_word16_t)ptr[0],32);
-         g[1]=ADD16((spx_word16_t)ptr[1],32);
-         g[2]=ADD16((spx_word16_t)ptr[2],32);
 
-         /* We favor "safe" pitch values to handle packet loss better */
-         gain_sum = ADD16(ADD16(g[1],MAX16(g[0], 0)),MAX16(g[2], 0));
-         if (gain_sum > 64)
-         {
-            gain_sum = SUB16(gain_sum, 64);
-            if (gain_sum > 127)
-               gain_sum = 127;
+      best_cdbk = pitch_gain_search_3tap_vq(gain_cdbk, gain_cdbk_size, C16, max_gain);
+
 #ifdef FIXED_POINT
-            pitch_control =  SUB16(64,EXTRACT16(PSHR32(MULT16_16(64,MULT16_16_16(plc_tuning, gain_sum)),10)));
-#else
-            pitch_control = 64*(1.-.001*plc_tuning*gain_sum);
-#endif
-            if (pitch_control < 0)
-               pitch_control = 0;
-         }
-         
-         sum = compute_pitch_error(C, g, pitch_control);
-         
-         if (sum>best_sum || i==0)
-         {
-            best_sum=sum;
-            best_cdbk=i;
-         }
-      }
-#ifdef FIXED_POINT
-      gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3]);
-      gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3+1]);
-      gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*3+2]);
+      gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4]);
+      gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4+1]);
+      gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[best_cdbk*4+2]);
       /*printf ("%d %d %d %d\n",gain[0],gain[1],gain[2], best_cdbk);*/
 #else
-      gain[0] = 0.015625*gain_cdbk[best_cdbk*3]  + .5;
-      gain[1] = 0.015625*gain_cdbk[best_cdbk*3+1]+ .5;
-      gain[2] = 0.015625*gain_cdbk[best_cdbk*3+2]+ .5;
+      gain[0] = 0.015625*gain_cdbk[best_cdbk*4]  + .5;
+      gain[1] = 0.015625*gain_cdbk[best_cdbk*4+1]+ .5;
+      gain[2] = 0.015625*gain_cdbk[best_cdbk*4+2]+ .5;
 #endif
       *cdbk_index=best_cdbk;
    }
 
-#ifdef FIXED_POINT
    for (i=0;i<nsf;i++)
-     exc[i]=SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),e[2][i]), MULT16_32_Q15(SHL16(gain[1],7),e[1][i])),
-                        MULT16_32_Q15(SHL16(gain[2],7),e[0][i])), 2);
-   
-   err=0;
+      exc[i]=0;
+   for (i=0;i<3;i++)
+   {
+      int j;
+      int tmp1, tmp3;
+      int pp=pitch+1-i;
+      tmp1=nsf;
+      if (tmp1>pp)
+         tmp1=pp;
+      for (j=0;j<tmp1;j++)
+         exc[j]=MAC16_16(exc[j],SHL16(gain[2-i],7),exc2[j-pp]);
+      tmp3=nsf;
+      if (tmp3>pp+pitch)
+         tmp3=pp+pitch;
+      for (j=tmp1;j<tmp3;j++)
+         exc[j]=MAC16_16(exc[j],SHL16(gain[2-i],7),exc2[j-pp-pitch]);
+   }
    for (i=0;i<nsf;i++)
    {
-      spx_word16_t perr2;
-      spx_sig_t tmp = SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),x[2][i]),MULT16_32_Q15(SHL16(gain[1],7),x[1][i])),
-                                  MULT16_32_Q15(SHL16(gain[2],7),x[0][i])),2);
-      spx_sig_t perr=SUB32(target[i],tmp);
-      new_target[i] = SUB32(target[i], tmp);
-      perr2 = EXTRACT16(PSHR32(perr,15));
-      err = ADD64(err,MULT16_16(perr2,perr2));
-      
+      spx_word32_t tmp = ADD32(ADD32(MULT16_16(gain[0],x[2][i]),MULT16_16(gain[1],x[1][i])),
+                            MULT16_16(gain[2],x[0][i]));
+      new_target[i] = SUB16(target[i], EXTRACT16(PSHR32(tmp,6)));
    }
-#else
-   for (i=0;i<nsf;i++)
-      exc[i]=gain[0]*e[2][i]+gain[1]*e[1][i]+gain[2]*e[0][i];
-   
-   err=0;
-   for (i=0;i<nsf;i++)
-   {
-      spx_sig_t tmp = gain[2]*x[0][i]+gain[1]*x[1][i]+gain[0]*x[2][i];
-      new_target[i] = target[i] - tmp;
-      err+=new_target[i]*new_target[i];
-   }
-#endif
+   err = inner_prod(new_target, new_target, nsf);
 
    return err;
 }
 
-
 /** Finds the best quantized 3-tap pitch predictor by analysis by synthesis */
 int pitch_search_3tap(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -578,24 +501,32 @@
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int complexity,
 int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 )
 {
    int i,j;
    int cdbk_index, pitch=0, best_gain_index=0;
    VARDECL(spx_sig_t *best_exc);
-   VARDECL(spx_sig_t *new_target);
-   VARDECL(spx_sig_t *best_target);
+   VARDECL(spx_word16_t *new_target);
+   VARDECL(spx_word16_t *best_target);
    int best_pitch=0;
-   spx_word64_t err, best_err=-1;
+   spx_word32_t err, best_err=-1;
    int N;
    const ltp_params *params;
+   const signed char *gain_cdbk;
+   int   gain_cdbk_size;
+   
    VARDECL(int *nbest);
-
+   
+   params = (const ltp_params*) par;
+   gain_cdbk_size = 1<<params->gain_bits;
+   gain_cdbk = params->gain_cdbk + 4*gain_cdbk_size*cdbk_offset;
+   
    N=complexity;
    if (N>10)
       N=10;
@@ -614,23 +545,24 @@
       return start;
    }
    
-   ALLOC(best_exc, nsf, spx_sig_t);
-   ALLOC(new_target, nsf, spx_sig_t);
-   ALLOC(best_target, nsf, spx_sig_t);
-   
    if (N>end-start+1)
       N=end-start+1;
    if (end != start)
       open_loop_nbest_pitch(sw, start, end, nsf, nbest, NULL, N, stack);
    else
       nbest[0] = start;
+   
+   ALLOC(best_exc, nsf, spx_sig_t);
+   ALLOC(new_target, nsf, spx_word16_t);
+   ALLOC(best_target, nsf, spx_word16_t);
+   
    for (i=0;i<N;i++)
    {
       pitch=nbest[i];
       for (j=0;j<nsf;j++)
          exc[j]=0;
-      err=pitch_gain_search_3tap(target, ak, awk1, awk2, exc, par, pitch, p, nsf,
-                                 bits, stack, exc2, r, new_target, &cdbk_index, cdbk_offset, plc_tuning);
+      err=pitch_gain_search_3tap(target, ak, awk1, awk2, exc, gain_cdbk, gain_cdbk_size, pitch, p, nsf,
+                                 bits, stack, exc2, r, new_target, &cdbk_index, plc_tuning, *cumul_gain);
       if (err<best_err || best_err<0)
       {
          for (j=0;j<nsf;j++)
@@ -642,10 +574,15 @@
          best_gain_index=cdbk_index;
       }
    }
-   
    /*printf ("pitch: %d %d\n", best_pitch, best_gain_index);*/
    speex_bits_pack(bits, best_pitch-start, params->pitch_bits);
    speex_bits_pack(bits, best_gain_index, params->gain_bits);
+#ifdef FIXED_POINT
+   *cumul_gain = MULT16_32_Q13(SHL16(params->gain_cdbk[4*best_gain_index+3],8), MAX32(1024,*cumul_gain));
+#else
+   *cumul_gain = 0.03125*MAX32(1024,*cumul_gain)*params->gain_cdbk[4*best_gain_index+3];
+#endif
+   /*printf ("%f\n", cumul_gain);*/
    /*printf ("encode pitch: %d %d\n", best_pitch, best_gain_index);*/
    for (i=0;i<nsf;i++)
       exc[i]=best_exc[i];
@@ -656,10 +593,11 @@
 }
 
 void pitch_unquant_3tap(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
@@ -682,20 +620,20 @@
 
    params = (const ltp_params*) par;
    gain_cdbk_size = 1<<params->gain_bits;
-   gain_cdbk = params->gain_cdbk + 3*gain_cdbk_size*cdbk_offset;
+   gain_cdbk = params->gain_cdbk + 4*gain_cdbk_size*cdbk_offset;
 
    pitch = speex_bits_unpack_unsigned(bits, params->pitch_bits);
    pitch += start;
    gain_index = speex_bits_unpack_unsigned(bits, params->gain_bits);
    /*printf ("decode pitch: %d %d\n", pitch, gain_index);*/
 #ifdef FIXED_POINT
-   gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3]);
-   gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3+1]);
-   gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*3+2]);
+   gain[0] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4]);
+   gain[1] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4+1]);
+   gain[2] = ADD16(32,(spx_word16_t)gain_cdbk[gain_index*4+2]);
 #else
-   gain[0] = 0.015625*gain_cdbk[gain_index*3]+.5;
-   gain[1] = 0.015625*gain_cdbk[gain_index*3+1]+.5;
-   gain[2] = 0.015625*gain_cdbk[gain_index*3+2]+.5;
+   gain[0] = 0.015625*gain_cdbk[gain_index*4]+.5;
+   gain[1] = 0.015625*gain_cdbk[gain_index*4+1]+.5;
+   gain[2] = 0.015625*gain_cdbk[gain_index*4+2]+.5;
 #endif
 
    if (count_lost && pitch > subframe_offset)
@@ -728,66 +666,36 @@
    gain_val[0]=gain[0];
    gain_val[1]=gain[1];
    gain_val[2]=gain[2];
-
+   gain[0] = SHL16(gain[0],7);
+   gain[1] = SHL16(gain[1],7);
+   gain[2] = SHL16(gain[2],7);
+   for (i=0;i<nsf;i++)
+      exc_out[i]=0;
+   for (i=0;i<3;i++)
    {
-      spx_sig_t *e[3];
-      VARDECL(spx_sig_t *tmp2);
-      ALLOC(tmp2, 3*nsf, spx_sig_t);
-      e[0]=tmp2;
-      e[1]=tmp2+nsf;
-      e[2]=tmp2+2*nsf;
-      
-      for (i=0;i<3;i++)
-      {
-         int j;
-         int pp=pitch+1-i;
-#if 0
-         for (j=0;j<nsf;j++)
-         {
-            if (j-pp<0)
-               e[i][j]=exc[j-pp];
-            else if (j-pp-pitch<0)
-               e[i][j]=exc[j-pp-pitch];
-            else
-               e[i][j]=0;
-         }
-#else
-         {
-            int tmp1, tmp3;
-            tmp1=nsf;
-            if (tmp1>pp)
-               tmp1=pp;
-            for (j=0;j<tmp1;j++)
-               e[i][j]=exc[j-pp];
-            tmp3=nsf;
-            if (tmp3>pp+pitch)
-               tmp3=pp+pitch;
-            for (j=tmp1;j<tmp3;j++)
-               e[i][j]=exc[j-pp-pitch];
-            for (j=tmp3;j<nsf;j++)
-               e[i][j]=0;
-         }
-#endif
-      }
-
-#ifdef FIXED_POINT
-      {
-         for (i=0;i<nsf;i++)
-            exc[i]=SHL32(ADD32(ADD32(MULT16_32_Q15(SHL16(gain[0],7),e[2][i]), MULT16_32_Q15(SHL16(gain[1],7),e[1][i])),
-                               MULT16_32_Q15(SHL16(gain[2],7),e[0][i])), 2);
-      }
-#else
-      for (i=0;i<nsf;i++)
-         exc[i]=VERY_SMALL+gain[0]*e[2][i]+gain[1]*e[1][i]+gain[2]*e[0][i];
-#endif
+      int j;
+      int tmp1, tmp3;
+      int pp=pitch+1-i;
+      tmp1=nsf;
+      if (tmp1>pp)
+         tmp1=pp;
+      for (j=0;j<tmp1;j++)
+         exc_out[j]=MAC16_16(exc_out[j],gain[2-i],exc[j-pp]);
+      tmp3=nsf;
+      if (tmp3>pp+pitch)
+         tmp3=pp+pitch;
+      for (j=tmp1;j<tmp3;j++)
+         exc_out[j]=MAC16_16(exc_out[j],gain[2-i],exc[j-pp-pitch]);
    }
+   /*for (i=0;i<nsf;i++)
+   exc[i]=PSHR32(exc32[i],13);*/
 }
 
 
 /** Forced pitch delay and gain */
 int forced_pitch_quant(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -800,30 +708,45 @@
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int complexity,
 int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 )
 {
    int i;
-   float coef = GAIN_SCALING_1*pitch_coef;
-   if (coef>.99)
-      coef=.99;
-   for (i=0;i<nsf;i++)
+   VARDECL(spx_sig_t *res);
+   ALLOC(res, nsf, spx_sig_t);
+#ifdef FIXED_POINT
+   if (pitch_coef>63)
+      pitch_coef=63;
+#else
+   if (pitch_coef>.99)
+      pitch_coef=.99;
+#endif
+   for (i=0;i<nsf&&i<start;i++)
    {
-      exc[i]=exc[i-start]*coef;
+      exc[i]=MULT16_16(SHL16(pitch_coef, 7),exc2[i-start]);
    }
+   for (;i<nsf;i++)
+   {
+      exc[i]=MULT16_32_Q15(SHL16(pitch_coef, 9),exc[i-start]);
+   }
+   syn_percep_zero(exc, ak, awk1, awk2, res, nsf, p, stack);
+   for (i=0;i<nsf;i++)
+      target[i]=EXTRACT16(SATURATE(SUB32(EXTEND32(target[i]),PSHR32(res[i],SIG_SHIFT-1)),32700));
    return start;
 }
 
 /** Unquantize forced pitch delay and gain */
 void forced_pitch_unquant(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
@@ -837,12 +760,17 @@
 )
 {
    int i;
-   float coef = GAIN_SCALING_1*pitch_coef;
-   if (coef>.99)
-      coef=.99;
+#ifdef FIXED_POINT
+   if (pitch_coef>63)
+      pitch_coef=63;
+#else
+   if (pitch_coef>.99)
+      pitch_coef=.99;
+#endif
    for (i=0;i<nsf;i++)
    {
-      exc[i]=exc[i-start]*coef;
+      exc_out[i]=MULT16_16(exc[i-start],SHL16(pitch_coef,7));
+      exc[i] = PSHR(exc_out[i],13);
    }
    *pitch_val = start;
    gain_val[0]=gain_val[2]=0;
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp.h b/pjmedia/src/pjmedia-codec/speex/ltp.h
index 36debbd..bc050c6 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp.h
+++ b/pjmedia/src/pjmedia-codec/speex/ltp.h
@@ -48,13 +48,16 @@
 #define gain_3tap_to_1tap(g) (ABS(g[1]) + (g[0]>0 ? g[0] : -.5*g[0]) + (g[2]>0 ? g[2] : -.5*g[2]))
 #endif
 
-void open_loop_nbest_pitch(spx_sig_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack);
+spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len);
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack);
+
+void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack);
 
 
 /** Finds the best quantized 3-tap pitch predictor by analysis by synthesis */
 int pitch_search_3tap(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -67,19 +70,21 @@
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int   complexity,
 int   cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 );
 
 /*Unquantize adaptive codebook and update pitch contribution*/
 void pitch_unquant_3tap(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
@@ -94,8 +99,8 @@
 
 /** Forced pitch delay and gain */
 int forced_pitch_quant(
-spx_sig_t target[],                 /* Target vector */
-spx_sig_t *sw,
+spx_word16_t target[],                 /* Target vector */
+spx_word16_t *sw,
 spx_coef_t ak[],                     /* LPCs for this subframe */
 spx_coef_t awk1[],                   /* Weighted LPCs #1 for this subframe */
 spx_coef_t awk2[],                   /* Weighted LPCs #2 for this subframe */
@@ -108,19 +113,21 @@
 int   nsf,                      /* Number of samples in subframe */
 SpeexBits *bits,
 char *stack,
-spx_sig_t *exc2,
+spx_word16_t *exc2,
 spx_word16_t *r,
 int complexity,
 int cdbk_offset,
-int plc_tuning
+int plc_tuning,
+spx_word32_t *cumul_gain
 );
 
 /** Unquantize forced pitch delay and gain */
 void forced_pitch_unquant(
-spx_sig_t exc[],                    /* Excitation */
+spx_word16_t exc[],             /* Input excitation */
+spx_word32_t exc_out[],         /* Output excitation */
 int   start,                    /* Smallest pitch value allowed */
 int   end,                      /* Largest pitch value allowed */
-spx_word16_t pitch_coef,               /* Voicing (pitch) coefficient */
+spx_word16_t pitch_coef,        /* Voicing (pitch) coefficient */
 const void *par,
 int   nsf,                      /* Number of samples in subframe */
 int *pitch_val,
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp_arm4.h b/pjmedia/src/pjmedia-codec/speex/ltp_arm4.h
index a5a0bee..7479e8b 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp_arm4.h
+++ b/pjmedia/src/pjmedia-codec/speex/ltp_arm4.h
@@ -33,7 +33,7 @@
 */
 
 #define OVERRIDE_INNER_PROD
-static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum1=0,sum2=0;
    spx_word16_t *deadx, *deady;
@@ -84,7 +84,7 @@
 }
 
 #define OVERRIDE_PITCH_XCORR
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i,j;
    for (i=0;i<nb_pitch;i+=4)
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp_bfin.h b/pjmedia/src/pjmedia-codec/speex/ltp_bfin.h
index e92dbe2..c466902 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp_bfin.h
+++ b/pjmedia/src/pjmedia-codec/speex/ltp_bfin.h
@@ -34,7 +34,7 @@
 */
 
 #define OVERRIDE_INNER_PROD
-static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum=0;
    __asm__ __volatile__ (
@@ -63,7 +63,7 @@
 }
 
 #define OVERRIDE_PITCH_XCORR
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    corr += nb_pitch - 1;
    __asm__ __volatile__ (
@@ -109,52 +109,41 @@
 }
 
 #define OVERRIDE_COMPUTE_PITCH_ERROR
-static inline spx_word32_t compute_pitch_error(spx_word32_t *C, spx_word16_t *g, spx_word16_t pitch_control)
+static inline spx_word32_t compute_pitch_error(spx_word16_t *C, spx_word16_t *g, spx_word16_t pitch_control)
 {
    spx_word32_t sum;
    __asm__ __volatile__
          (
-         "A1 = A0 = 0;\n\t"
+         "A0 = 0;\n\t"
          
-         "R0 = [%1++];\n\t"
+         "R0 = W[%1++];\n\t"
          "R1.L = %2.L*%5.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 += R1.L*R0.L (M), A0 += R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 += R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %3.L*%5.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 += R1.L*R0.L (M), A0 += R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 += R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %4.L*%5.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 += R1.L*R0.L (M), A0 += R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 += R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %2.L*%3.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 -= R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
 
          "R1.L = %4.L*%3.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 -= R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %4.L*%2.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 -= R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %2.L*%2.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 -= R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
 
          "R1.L = %3.L*%3.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS) || R0 = [%1++];\n\t"
+         "A0 -= R1.L*R0.L (IS) || R0 = W[%1++];\n\t"
          
          "R1.L = %4.L*%4.L (IS);\n\t"
-         "R0 <<= 1;\n\t"
-         "A1 -= R1.L*R0.L (M), A0 -= R1.L*R0.H (IS);\n\t"
+         "A0 -= R1.L*R0.L (IS);\n\t"
          
-         "A1 = A1 >>> 16;\n\t"
-         "A0 += A1;\n\t"
          "%0 = A0;\n\t"
    : "=&D" (sum), "=a" (C)
    : "d" (g[0]), "d" (g[1]), "d" (g[2]), "d" (pitch_control), "1" (C)
@@ -163,3 +152,268 @@
    return sum;
 }
 
+#define OVERRIDE_OPEN_LOOP_NBEST_PITCH
+#ifdef OVERRIDE_OPEN_LOOP_NBEST_PITCH
+void open_loop_nbest_pitch(spx_word16_t *sw, int start, int end, int len, int *pitch, spx_word16_t *gain, int N, char *stack)
+{
+   int i,j,k;
+   VARDECL(spx_word32_t *best_score);
+   VARDECL(spx_word32_t *best_ener);
+   spx_word32_t e0;
+   VARDECL(spx_word32_t *corr);
+   VARDECL(spx_word32_t *energy);
+
+   ALLOC(best_score, N, spx_word32_t);
+   ALLOC(best_ener, N, spx_word32_t);
+   ALLOC(corr, end-start+1, spx_word32_t);
+   ALLOC(energy, end-start+2, spx_word32_t);
+
+   for (i=0;i<N;i++)
+   {
+        best_score[i]=-1;
+        best_ener[i]=0;
+        pitch[i]=start;
+   }
+
+   energy[0]=inner_prod(sw-start, sw-start, len);
+   e0=inner_prod(sw, sw, len);
+
+   /* energy update -------------------------------------*/
+
+      __asm__ __volatile__
+      (
+"        P0 = %0;\n\t"
+"        I1 = %1;\n\t"
+"        L1 = 0;\n\t"
+"        I2 = %2;\n\t"
+"        L2 = 0;\n\t"
+"        R2 = [P0++];\n\t"
+"        R3 = 0;\n\t"
+"        LSETUP (eu1, eu2) LC1 = %3;\n\t"
+"eu1:      R1.L = W [I1--] || R0.L = W [I2--] ;\n\t"
+"          R1 = R1.L * R1.L (IS);\n\t"
+"          R0 = R0.L * R0.L (IS);\n\t"
+"          R1 >>>= 6;\n\t"
+"          R1 = R1 + R2;\n\t"
+"          R0 >>>= 6;\n\t"
+"          R1 = R1 - R0;\n\t"
+"          R2 = MAX(R1,R3);\n\t"
+"eu2:      [P0++] = R2;\n\t"
+       : : "d" (energy), "d" (&sw[-start-1]), "d" (&sw[-start+len-1]),
+           "a" (end-start)  
+       : "P0", "I1", "I2", "R0", "R1", "R2", "R3"
+#if (__GNUC__ == 4)
+         , "LC1"
+#endif
+       );
+
+   pitch_xcorr(sw, sw-end, corr, len, end-start+1, stack);
+
+   /* FIXME: Fixed-point and floating-point code should be merged */
+   {
+      VARDECL(spx_word16_t *corr16);
+      VARDECL(spx_word16_t *ener16);
+      ALLOC(corr16, end-start+1, spx_word16_t);
+      ALLOC(ener16, end-start+1, spx_word16_t);
+      /* Normalize to 180 so we can square it and it still fits in 16 bits */
+      normalize16(corr, corr16, 180, end-start+1);
+      normalize16(energy, ener16, 180, end-start+1);
+
+      if (N == 1) {
+	/* optimised asm to handle N==1 case */
+      __asm__ __volatile__
+      (
+"        I0 = %1;\n\t"                     /* I0: corr16[]    */
+"        L0 = 0;\n\t"
+"        I1 = %2;\n\t"                     /* I1: ener16[]    */
+"        L1 = 0;\n\t"
+"        R2 = -1;\n\t"                     /* R2: best score  */
+"        R3 = 0;\n\t"                      /* R3: best energy */
+"        P0 = %4;\n\t"                     /* P0: best pitch  */
+"        P1 = %4;\n\t"                     /* P1: counter     */
+"        LSETUP (sl1, sl2) LC1 = %3;\n\t"
+"sl1:      R0.L = W [I0++] || R1.L = W [I1++];\n\t"         
+"          R0 = R0.L * R0.L (IS);\n\t"
+"          R1   += 1;\n\t"
+"          R4   = R0.L * R3.L;\n\t"
+"          R5   = R2.L * R1.L;\n\t"
+"          cc   = R5 < R4;\n\t"
+"          if cc R2 = R0;\n\t"
+"          if cc R3 = R1;\n\t"
+"          if cc P0 = P1;\n\t"
+"sl2:      P1 += 1;\n\t"
+"        %0 = P0;\n\t"
+       : "=&d" (pitch[0])
+       : "a" (corr16), "a" (ener16), "a" (end+1-start), "d" (start) 
+       : "P0", "P1", "I0", "I1", "R0", "R1", "R2", "R3", "R4", "R5"
+#if (__GNUC__ == 4)
+         , "LC1"
+#endif
+       );
+
+      }
+      else {
+	for (i=start;i<=end;i++)
+	  {
+	    spx_word16_t tmp = MULT16_16_16(corr16[i-start],corr16[i-start]);
+	    /* Instead of dividing the tmp by the energy, we multiply on the other side */
+	    if (MULT16_16(tmp,best_ener[N-1])>MULT16_16(best_score[N-1],ADD16(1,ener16[i-start])))
+	      {
+		/* We can safely put it last and then check */
+		best_score[N-1]=tmp;
+		best_ener[N-1]=ener16[i-start]+1;
+		pitch[N-1]=i;
+		/* Check if it comes in front of others */
+		for (j=0;j<N-1;j++)
+		  {
+		    if (MULT16_16(tmp,best_ener[j])>MULT16_16(best_score[j],ADD16(1,ener16[i-start])))
+		      {
+			for (k=N-1;k>j;k--)
+			  {
+			    best_score[k]=best_score[k-1];
+			    best_ener[k]=best_ener[k-1];
+			    pitch[k]=pitch[k-1];
+			  }
+			best_score[j]=tmp;
+			best_ener[j]=ener16[i-start]+1;
+			pitch[j]=i;
+			break;
+		      }
+		  }
+	      }
+	  }
+      }
+   }
+
+   /* Compute open-loop gain */
+   if (gain)
+   {
+       for (j=0;j<N;j++)
+       {
+          spx_word16_t g;
+          i=pitch[j];
+          g = DIV32(corr[i-start], 10+SHR32(MULT16_16(spx_sqrt(e0),spx_sqrt(energy[i-start])),6));
+          /* FIXME: g = max(g,corr/energy) */
+                   if (g<0)
+                   g = 0;
+             gain[j]=g;
+       }
+   }
+}
+#endif
+
+#define OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+#ifdef OVERRIDE_PITCH_GAIN_SEARCH_3TAP_VQ
+static int pitch_gain_search_3tap_vq(
+  const signed char *gain_cdbk,
+  int                gain_cdbk_size,
+  spx_word16_t      *C16,
+  spx_word16_t       max_gain
+)
+{
+  const signed char *ptr=gain_cdbk;
+  int                best_cdbk=0;
+  spx_word32_t       best_sum=-VERY_LARGE32;
+  spx_word32_t       sum=0;
+  spx_word16_t       g[3];
+  spx_word16_t       pitch_control=64;
+  spx_word16_t       gain_sum;
+  int                i;
+
+      /* fast asm version of VQ codebook search */
+
+      __asm__ __volatile__
+      (
+
+"        P0 = %2;\n\t"                     /* P0: ptr to gain_cdbk */
+"        L1 = 0;\n\t"                      /* no circ addr for L1  */
+"        %0 = 0;\n\t"                      /* %0: best_sum         */
+"        %1 = 0;\n\t"                      /* %1: best_cdbk        */
+"        P1 = 0;\n\t"                      /* P1: loop counter     */
+"        R5 = 64;\n\t"                     /* R5: pitch_control    */
+
+"        LSETUP (pgs1, pgs2) LC1 = %4;\n\t"
+"pgs1:     R2  = B [P0++] (X);\n\t"        /* R2: g[0]             */
+"          R3  = B [P0++] (X);\n\t"        /* R3: g[1]             */
+"          R4  = B [P0++] (X);\n\t"        /* R4: g[2]             */
+"          R2 += 32;\n\t"
+"          R3 += 32;\n\t"
+"          R4 += 32;\n\t"
+
+"          R0  = B [P0++] (X);\n\t"              
+"          B0  = R0;\n\t"                  /* B0: gain_sum         */
+          
+           /* compute_pitch_error() -------------------------------*/
+
+"          I1 = %3;\n\t"                   /* I1: ptr to C         */
+"          A0 = 0;\n\t"
+         
+"          R0.L = W[I1++];\n\t"
+"          R1.L = R2.L*R5.L (IS);\n\t"
+"          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R3.L*R5.L (IS);\n\t"
+"          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R4.L*R5.L (IS);\n\t"
+"          A0 += R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R2.L*R3.L (IS);\n\t"
+"          A0 -= R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+
+"          R1.L = R4.L*R3.L (IS);\n\t"
+"          A0 -= R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R4.L*R2.L (IS);\n\t"
+"          A0 -= R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R2.L*R2.L (IS);\n\t"
+"          A0 -= R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+
+"          R1.L = R3.L*R3.L (IS);\n\t"
+"          A0 -= R1.L*R0.L (IS) || R0.L = W[I1++];\n\t"
+         
+"          R1.L = R4.L*R4.L (IS);\n\t"
+"          R0 = (A0 -= R1.L*R0.L) (IS);\n\t"
+
+/*
+    Re-arrange the if-then to code efficiently on the Blackfin:
+
+      if (sum>best_sum && gain_sum<=max_gain)   ------ (1)
+
+      if (sum>best_sum && !(gain_sum>max_gain)) ------ (2)
+
+      if (max_gain<=gain_sum) {                 ------ (3)
+      sum = -VERY_LARGE32;
+      }
+      if (best_sum<=sum)
+
+    The Blackfin cc instructions are all of the form:
+
+      cc = x < y (or cc = x <= y)
+*/
+"          R1 = B0\n\t"
+"          R2 = %5\n\t"
+"          R3 = %6\n\t"
+"          cc = R2 <= R1;\n\t" 
+"          if cc R0 = R3;\n\t"
+"          cc = %0 <= R0;\n\t"
+"          if cc %0 = R0;\n\t"
+"          if cc %1 = P1;\n\t"
+
+"pgs2:     P1 += 1;\n\t"
+   
+       : "=&d" (best_sum), "=&d" (best_cdbk) 
+       : "a" (gain_cdbk), "a" (C16), "a" (gain_cdbk_size), "a" (max_gain),
+         "b" (-VERY_LARGE32)
+       : "R0", "R1", "R2", "R3", "R4", "R5", "P0", 
+         "P1", "I1", "L1", "A0", "B0"
+#if (__GNUC__ == 4)
+         , "LC1"
+#endif
+       );
+
+  return best_cdbk;
+}
+#endif
+
diff --git a/pjmedia/src/pjmedia-codec/speex/ltp_sse.h b/pjmedia/src/pjmedia-codec/speex/ltp_sse.h
index 94c0012..bed6eaa 100644
--- a/pjmedia/src/pjmedia-codec/speex/ltp_sse.h
+++ b/pjmedia/src/pjmedia-codec/speex/ltp_sse.h
@@ -35,7 +35,7 @@
 #include <xmmintrin.h>
 
 #define OVERRIDE_INNER_PROD
-static float inner_prod(const float *a, const float *b, int len)
+float inner_prod(const float *a, const float *b, int len)
 {
    int i;
    float ret;
@@ -54,7 +54,7 @@
 }
 
 #define OVERRIDE_PITCH_XCORR
-static void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack)
+void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack)
 {
    int i, offset;
    VARDECL(__m128 *x);
diff --git a/pjmedia/src/pjmedia-codec/speex/mdf.c b/pjmedia/src/pjmedia-codec/speex/mdf.c
index 0e7219c..eabf433 100644
--- a/pjmedia/src/pjmedia-codec/speex/mdf.c
+++ b/pjmedia/src/pjmedia-codec/speex/mdf.c
@@ -90,7 +90,7 @@
 #endif
 
 #ifdef FIXED_POINT
-static const spx_float_t MIN_LEAK = ((spx_float_t){16777, -24});
+static const spx_float_t MIN_LEAK = {16777, -24};
 #define TOP16(x) ((x)>>16)
 #else
 static const spx_float_t MIN_LEAK = .001f;
@@ -140,9 +140,13 @@
    spx_word16_t preemph;
    spx_word16_t notch_radius;
    spx_mem_t notch_mem[2];
+
+   /* NOTE: If you only use speex_echo_cancel() and want to save some memory, remove this */
+   spx_int16_t *play_buf;
+   int play_buf_pos;
 };
 
-static inline void filter_dc_notch16(spx_int16_t *in, spx_word16_t radius, spx_word16_t *out, int len, spx_mem_t *mem)
+static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius, spx_word16_t *out, int len, spx_mem_t *mem)
 {
    int i;
    spx_word16_t den2;
@@ -166,17 +170,15 @@
    }
 }
 
-static inline spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+static inline spx_word32_t mdf_inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
    spx_word32_t sum=0;
-   len >>= 2;
+   len >>= 1;
    while(len--)
    {
       spx_word32_t part=0;
       part = MAC16_16(part,*x++,*y++);
       part = MAC16_16(part,*x++,*y++);
-      part = MAC16_16(part,*x++,*y++);
-      part = MAC16_16(part,*x++,*y++);
       /* HINT: If you had a 40-bit accumulator, you could shift only at the end */
       sum = ADD32(sum,SHR32(part,6));
    }
@@ -184,7 +186,7 @@
 }
 
 /** Compute power spectrum of a half-complex (packed) vector */
-static inline void power_spectrum(spx_word16_t *X, spx_word32_t *ps, int N)
+static inline void power_spectrum(const spx_word16_t *X, spx_word32_t *ps, int N)
 {
    int i, j;
    ps[0]=MULT16_16(X[0],X[0]);
@@ -197,7 +199,7 @@
 
 /** Compute cross-power spectrum of a half-complex (packed) vectors and add to acc */
 #ifdef FIXED_POINT
-static inline void spectral_mul_accum(spx_word16_t *X, spx_word32_t *Y, spx_word16_t *acc, int N, int M)
+static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t *Y, spx_word16_t *acc, int N, int M)
 {
    int i,j;
    spx_word32_t tmp1=0,tmp2=0;
@@ -225,7 +227,7 @@
    acc[N-1] = PSHR32(tmp1,WEIGHT_SHIFT);
 }
 #else
-static inline void spectral_mul_accum(spx_word16_t *X, spx_word32_t *Y, spx_word16_t *acc, int N, int M)
+static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t *Y, spx_word16_t *acc, int N, int M)
 {
    int i,j;
    for (i=0;i<N;i++)
@@ -246,7 +248,7 @@
 #endif
 
 /** Compute weighted cross-power spectrum of a half-complex (packed) vector with conjugate */
-static inline void weighted_spectral_mul_conj(spx_float_t *w, spx_word16_t *X, spx_word16_t *Y, spx_word32_t *prod, int N)
+static inline void weighted_spectral_mul_conj(const spx_float_t *w, const spx_word16_t *X, const spx_word16_t *Y, spx_word32_t *prod, int N)
 {
    int i, j;
    prod[0] = FLOAT_MUL32(w[0],MULT16_16(X[0],Y[0]));
@@ -273,10 +275,10 @@
    st->sum_adapt = 0;
    /* FIXME: Make that an init option (new API call?) */
    st->sampling_rate = 8000;
-   st->spec_average = DIV32_16(SHL32(st->frame_size, 15), st->sampling_rate);
+   st->spec_average = DIV32_16(SHL32(EXTEND32(st->frame_size), 15), st->sampling_rate);
 #ifdef FIXED_POINT
-   st->beta0 = DIV32_16(SHL32(st->frame_size, 16), st->sampling_rate);
-   st->beta_max = DIV32_16(SHL32(st->frame_size, 14), st->sampling_rate);
+   st->beta0 = DIV32_16(SHL32(EXTEND32(st->frame_size), 16), st->sampling_rate);
+   st->beta_max = DIV32_16(SHL32(EXTEND32(st->frame_size), 14), st->sampling_rate);
 #else
    st->beta0 = (2.0f*st->frame_size)/st->sampling_rate;
    st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
@@ -332,6 +334,10 @@
    st->notch_mem[0] = st->notch_mem[1] = 0;
    st->adapted = 0;
    st->Pey = st->Pyy = FLOAT_ONE;
+   
+   st->play_buf = (spx_int16_t*)speex_alloc(2*st->frame_size*sizeof(spx_int16_t));
+   st->play_buf_pos = 0;
+
    return st;
 }
 
@@ -385,12 +391,46 @@
 #ifdef FIXED_POINT
    speex_free(st->wtmp2);
 #endif
+   speex_free(st->play_buf);
    speex_free(st);
 }
 
-extern int fixed_point;
+void speex_echo_capture(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *out, spx_int32_t *Yout)
+{
+   int i;
+   if (st->play_buf_pos>=st->frame_size)
+   {
+      speex_echo_cancel(st, rec, st->play_buf, out, Yout);
+      st->play_buf_pos -= st->frame_size;
+      for (i=0;i<st->frame_size;i++)
+         st->play_buf[i] = st->play_buf[i+st->frame_size];
+   } else {
+      speex_warning("no playback frame available");
+      if (st->play_buf_pos!=0)
+      {
+         speex_warning("internal playback buffer corruption?");
+         st->play_buf_pos = 0;
+      }
+      for (i=0;i<st->frame_size;i++)
+         out[i] = rec[i];
+   }
+}
+
+void speex_echo_playback(SpeexEchoState *st, const spx_int16_t *play)
+{
+   if (st->play_buf_pos<=st->frame_size)
+   {
+      int i;
+      for (i=0;i<st->frame_size;i++)
+         st->play_buf[st->play_buf_pos+i] = play[i];
+      st->play_buf_pos += st->frame_size;
+   } else {
+      speex_warning("had to discard a playback frame");
+   }
+}
+
 /** Performs echo cancellation on a frame */
-void speex_echo_cancel(SpeexEchoState *st, short *ref, short *echo, short *out, spx_int32_t *Yout)
+void speex_echo_cancel(SpeexEchoState *st, const spx_int16_t *ref, const spx_int16_t *echo, spx_int16_t *out, spx_int32_t *Yout)
 {
    int i,j;
    int N,M;
@@ -402,6 +442,7 @@
    spx_word16_t RER;
    spx_word32_t tmp32;
    spx_word16_t M_1;
+   int saturated=0;
    
    N = st->window_size;
    M = st->M;
@@ -416,18 +457,46 @@
    M_1 = 1.f/M;
 #endif
 
-   filter_dc_notch16((spx_int16_t*)ref, st->notch_radius, st->d, st->frame_size, st->notch_mem);
+   filter_dc_notch16(ref, st->notch_radius, st->d, st->frame_size, st->notch_mem);
    /* Copy input data to buffer */
    for (i=0;i<st->frame_size;i++)
    {
       spx_word16_t tmp;
+      spx_word32_t tmp32;
       st->x[i] = st->x[i+st->frame_size];
-      st->x[i+st->frame_size] = SUB16(echo[i], MULT16_16_P15(st->preemph, st->memX));
+      tmp32 = SUB32(EXTEND32(echo[i]), EXTEND32(MULT16_16_P15(st->preemph, st->memX)));
+#ifdef FIXED_POINT
+      /*FIXME: If saturation occurs here, we need to freeze adaptation for M frames (not just one) */
+      if (tmp32 > 32767)
+      {
+         tmp32 = 32767;
+         saturated = 1;
+      }      
+      if (tmp32 < -32767)
+      {
+         tmp32 = -32767;
+         saturated = 1;
+      }      
+#endif
+      st->x[i+st->frame_size] = EXTRACT16(tmp32);
       st->memX = echo[i];
       
       tmp = st->d[i];
       st->d[i] = st->d[i+st->frame_size];
-      st->d[i+st->frame_size] = SUB16(tmp, MULT16_16_P15(st->preemph, st->memD));
+      tmp32 = SUB32(EXTEND32(tmp), EXTEND32(MULT16_16_P15(st->preemph, st->memD)));
+#ifdef FIXED_POINT
+      if (tmp32 > 32767)
+      {
+         tmp32 = 32767;
+         saturated = 1;
+      }      
+      if (tmp32 < -32767)
+      {
+         tmp32 = -32767;
+         saturated = 1;
+      }
+#endif
+      st->d[i+st->frame_size] = tmp32;
       st->memD = tmp;
    }
 
@@ -465,6 +534,12 @@
       else if (tmp_out<-32768)
          tmp_out = -32768;
       tmp_out = ADD32(tmp_out, EXTEND32(MULT16_16_P15(st->preemph, st->memE)));
+      /* This is an arbitrary test for saturation */
+      if (ref[i] <= -32000 || ref[i] >= 32000)
+      {
+         tmp_out = 0;
+         saturated = 1;
+      }
       out[i] = tmp_out;
       st->memE = tmp_out;
    }
@@ -477,9 +552,9 @@
    }
 
    /* Compute a bunch of correlations */
-   See = inner_prod(st->e+st->frame_size, st->e+st->frame_size, st->frame_size);
-   See = ADD32(See, SHR32(10000,6));
-   Syy = inner_prod(st->y+st->frame_size, st->y+st->frame_size, st->frame_size);
+   See = mdf_inner_prod(st->e+st->frame_size, st->e+st->frame_size, st->frame_size);
+   See = ADD32(See, SHR32(EXTEND32(10000),6));
+   Syy = mdf_inner_prod(st->y+st->frame_size, st->y+st->frame_size, st->frame_size);
    
    /* Convert error to frequency domain */
    spx_fft(st->fft_table, st->e, st->E);
@@ -544,8 +619,9 @@
       st->Pey = FLOAT_MULT(MIN_LEAK,st->Pyy);
    if (FLOAT_GT(st->Pey, st->Pyy))
       st->Pey = st->Pyy;
-   /* leak_estimate is the limear regression result */
+   /* leak_estimate is the linear regression result */
    leak_estimate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIVU(st->Pey, st->Pyy),14));
+   /* This looks like a stupid bug, but it's right (because we convert from Q14 to Q15) */
    if (leak_estimate > 16383)
       leak_estimate = 32767;
    else
@@ -594,7 +670,7 @@
       spx_word32_t Sxx;
       spx_word16_t adapt_rate=0;
 
-      Sxx = inner_prod(st->x+st->frame_size, st->x+st->frame_size, st->frame_size);
+      Sxx = mdf_inner_prod(st->x+st->frame_size, st->x+st->frame_size, st->frame_size);
       /* Temporary adaption rate if filter is not adapted correctly */
 
       tmp32 = MULT16_32_Q15(QCONST16(.15f, 15), Sxx);
@@ -620,12 +696,15 @@
       weighted_spectral_mul_conj(st->power_1, &st->X[j*N], st->E, st->PHI+N*j, N);
    }
 
-   /* Gradient descent */
-   for (i=0;i<M*N;i++)
+   if (!saturated)
    {
-      st->W[i] += st->PHI[i];
-      /* Old value of W in PHI */
-      st->PHI[i] = st->W[i] - st->PHI[i];
+      /* Gradient descent */
+      for (i=0;i<M*N;i++)
+      {
+         st->W[i] += st->PHI[i];
+         /* Old value of W in PHI */
+         st->PHI[i] = st->W[i] - st->PHI[i];
+      }
    }
    
    /* Update weight to prevent circular convolution (MDF / AUMDF) */
@@ -637,7 +716,7 @@
       {
 #ifdef FIXED_POINT
          for (i=0;i<N;i++)
-            st->wtmp2[i] = PSHR32(st->W[j*N+i],NORMALIZE_SCALEDOWN+16);
+            st->wtmp2[i] = EXTRACT16(PSHR32(st->W[j*N+i],NORMALIZE_SCALEDOWN+16));
          spx_ifft(st->fft_table, st->wtmp2, st->wtmp);
          for (i=0;i<st->frame_size;i++)
          {
@@ -645,12 +724,12 @@
          }
          for (i=st->frame_size;i<N;i++)
          {
-            st->wtmp[i]=SHL(st->wtmp[i],NORMALIZE_SCALEUP);
+            st->wtmp[i]=SHL16(st->wtmp[i],NORMALIZE_SCALEUP);
          }
          spx_fft(st->fft_table, st->wtmp, st->wtmp2);
          /* The "-1" in the shift is a sort of kludge that trades less efficient update speed for decrease noise */
          for (i=0;i<N;i++)
-            st->W[j*N+i] -= SHL32(st->wtmp2[i],16+NORMALIZE_SCALEDOWN-NORMALIZE_SCALEUP-1);
+            st->W[j*N+i] -= SHL32(EXTEND32(st->wtmp2[i]),16+NORMALIZE_SCALEDOWN-NORMALIZE_SCALEUP-1);
 #else
          spx_ifft(st->fft_table, &st->W[j*N], st->wtmp);
          for (i=st->frame_size;i<N;i++)
@@ -715,10 +794,10 @@
          break;
       case SPEEX_ECHO_SET_SAMPLING_RATE:
          st->sampling_rate = (*(int*)ptr);
-         st->spec_average = DIV32_16(SHL32(st->frame_size, 15), st->sampling_rate);
+         st->spec_average = DIV32_16(SHL32(EXTEND32(st->frame_size), 15), st->sampling_rate);
 #ifdef FIXED_POINT
-         st->beta0 = DIV32_16(SHL32(st->frame_size, 16), st->sampling_rate);
-         st->beta_max = DIV32_16(SHL32(st->frame_size, 14), st->sampling_rate);
+         st->beta0 = DIV32_16(SHL32(EXTEND32(st->frame_size), 16), st->sampling_rate);
+         st->beta_max = DIV32_16(SHL32(EXTEND32(st->frame_size), 14), st->sampling_rate);
 #else
          st->beta0 = (2.0f*st->frame_size)/st->sampling_rate;
          st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
diff --git a/pjmedia/src/pjmedia-codec/speex/misc.c b/pjmedia/src/pjmedia-codec/speex/misc.c
index fc2cb7c..53bdd0b 100644
--- a/pjmedia/src/pjmedia-codec/speex/misc.c
+++ b/pjmedia/src/pjmedia-codec/speex/misc.c
@@ -196,12 +196,12 @@
 #endif
 
 #ifdef FIXED_POINT
-spx_word32_t speex_rand(spx_word16_t std, spx_int32_t *seed)
+spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed)
 {
    spx_word32_t res;
    *seed = 1664525 * *seed + 1013904223;
    res = MULT16_16(EXTRACT16(SHR32(*seed,16)),std);
-   return SUB32(res, SHR(res, 3));
+   return PSHR32(SUB32(res, SHR(res, 3)),14);
 }
 #else
 spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed)
@@ -216,19 +216,6 @@
 }
 #endif
 
-void speex_rand_vec(float std, spx_sig_t *data, int len)
-{
-   int i;
-   for (i=0;i<len;i++)
-      data[i]+=SIG_SCALING*3*std*((((float)rand())/RAND_MAX)-.5);
-}
-
-
-/*float speex_rand(float std)
-{
-   return 3*std*((((float)rand())/RAND_MAX)-.5);
-}*/
-
 #ifndef OVERRIDE_SPEEX_PUTC
 void _speex_putc(int ch, void *file)
 {
diff --git a/pjmedia/src/pjmedia-codec/speex/misc.h b/pjmedia/src/pjmedia-codec/speex/misc.h
index 2e69b20..95e5d0c 100644
--- a/pjmedia/src/pjmedia-codec/speex/misc.h
+++ b/pjmedia/src/pjmedia-codec/speex/misc.h
@@ -38,9 +38,34 @@
 #ifndef SPEEX_VERSION
 #define SPEEX_MAJOR_VERSION 1         /**< Major Speex version. */
 #define SPEEX_MINOR_VERSION 1         /**< Minor Speex version. */
-#define SPEEX_MICRO_VERSION 12        /**< Micro Speex version. */
+#define SPEEX_MICRO_VERSION 13        /**< Micro Speex version. */
 #define SPEEX_EXTRA_VERSION ""        /**< Extra Speex version. */
-#define SPEEX_VERSION "speex-1.1.12"  /**< Speex version string. */
+#define SPEEX_VERSION "speex-1.1.13"  /**< Speex version string. */
+#endif
+
+/* A couple of tests to catch stupid option combinations */
+#ifdef FIXED_POINT
+
+#ifdef _USE_SSE
+#error SSE is only for floating-point
+#endif
+#if ((defined (ARM4_ASM)||defined (ARM5E_ASM)) && defined(BFIN_ASM)) || (defined (ARM4_ASM)&&defined(ARM5E_ASM))
+#error Make up your mind. What CPU do you have?
+#endif /* rejects any mix of ARM4, ARM5E and Blackfin assembly selections */
+#ifdef VORBIS_PSYCHO
+#error Vorbis-psy model currently not implemented in fixed-point
+#endif
+
+#else
+
+#if defined (ARM4_ASM) || defined(ARM5E_ASM) || defined(BFIN_ASM)
+#error I suppose you can have a [ARM4/ARM5E/Blackfin] that has float instructions?
+#endif
+#ifdef FIXED_POINT_DEBUG
+#error Don't you think enabling fixed-point is a good thing to do if you want to debug that?
+#endif
+
+
 #endif
 
 #include "arch.h"
@@ -88,11 +113,8 @@
 /** Print warning message with integer argument to stderr */
 void speex_warning_int(const char *str, int val);
 
-/** Generate a vector of random numbers */
-void speex_rand_vec(float std, spx_sig_t *data, int len);
-
 /** Generate a random number */
-spx_word32_t speex_rand(spx_word16_t std, spx_int32_t *seed);
+spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed);
 
 /** Speex wrapper for putc */
 void _speex_putc(int ch, void *file);
diff --git a/pjmedia/src/pjmedia-codec/speex/modes.c b/pjmedia/src/pjmedia-codec/speex/modes.c
index aa15729..1da236d 100644
--- a/pjmedia/src/pjmedia-codec/speex/modes.c
+++ b/pjmedia/src/pjmedia-codec/speex/modes.c
@@ -190,11 +190,7 @@
    noise_codebook_quant,
    noise_codebook_unquant,
    NULL,
-#ifdef FIXED_POINT
-   22938, 22938, 0, -1,
-#else
-   .7, .7, 0, -1,
-#endif
+   -1,
    43
 };
 
@@ -215,11 +211,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb_ulbr,
-#ifdef FIXED_POINT
-   22938, 16384, 11796, 21299,
-#else
-   0.7, 0.5, .36, .65,
-#endif
+   QCONST16(.65,15),
    79
 };
 
@@ -240,11 +232,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb_vlbr,
-#ifdef FIXED_POINT
-   22938, 16384, 11796, 18022,
-#else
-   0.7, 0.5, .36, .55,
-#endif
+   QCONST16(.55,15),
    119
 };
 
@@ -265,11 +253,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb_lbr,
-#ifdef FIXED_POINT
-   22938, 18022, 9830, 14746,
-#else
-   0.7, 0.55, .30, .45,
-#endif
+   QCONST16(.45,15),
    160
 };
 
@@ -290,11 +274,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb_med,
-#ifdef FIXED_POINT
-   22938, 20644, 5243, 11469,
-#else
-   0.7, 0.63, .16, .35,
-#endif
+   QCONST16(.35,15),
    220
 };
 
@@ -315,11 +295,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb,
-#ifdef FIXED_POINT
-   22938, 21299, 3932, 8192,
-#else
-   0.7, 0.65, .12, .25,
-#endif
+   QCONST16(.2,15),
    300
 };
 
@@ -340,11 +316,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_sb,
-#ifdef FIXED_POINT
-   22282, 21299, 2294, 3277,
-#else
-   0.68, 0.65, .07, .1,
-#endif
+   QCONST16(.1,15),
    364
 };
 
@@ -365,11 +337,7 @@
    split_cb_search_shape_sign,
    split_cb_shape_sign_unquant,
    &split_cb_nb,
-#ifdef FIXED_POINT
-   21299, 21299, 0, -1,
-#else
-   0.65, 0.65, .0, -1,
-#endif
+   -1,
    492
 };
 
@@ -434,11 +402,7 @@
    NULL,
    NULL,
    NULL,
-#ifdef FIXED_POINT
-   24576, 24576, 0, -1,
-#else
-   .75, .75, .0, -1,
-#endif
+   -1,
    36
 };
 
@@ -463,11 +427,7 @@
 #else
    &split_cb_high_lbr,
 #endif
-#ifdef FIXED_POINT
-   27853, 19661, 8192, -1,
-#else
-   .85, .6, .25, -1,
-#endif
+   -1,
    112
 };
 
@@ -492,12 +452,7 @@
 #else
    &split_cb_high,
 #endif
-
-#ifdef FIXED_POINT
-   24576, 22938, 1638, -1,
-#else
-   .75, .7, .05, -1,
-#endif
+   -1,
    192
 };
 
@@ -521,11 +476,7 @@
 #else
    &split_cb_high,
 #endif
-#ifdef FIXED_POINT
-   24576, 24576, 0, -1,
-#else
-   .75, .75, .0, -1,
-#endif
+   -1,
    352
 };
 
@@ -542,8 +493,8 @@
 #else
    0.9, 0.6, /* gamma1, gamma2 */
 #endif
-   .001,   /*lag_factor*/
-   QCONST16(.0001,15), /*lpc_floor*/
+   .012,   /*lag_factor*/
+   QCONST16(.0002,15), /*lpc_floor*/
    0.9,
    {NULL, &wb_submode1, &wb_submode2, &wb_submode3, &wb_submode4, NULL, NULL, NULL},
    3,
@@ -588,8 +539,8 @@
 #else
    0.9, 0.6, /* gamma1, gamma2 */
 #endif
-   .002,   /*lag_factor*/
-   QCONST16(.0001,15), /*lpc_floor*/
+   .012,   /*lag_factor*/
+   QCONST16(.0002,15), /*lpc_floor*/
    0.7,
    {NULL, &wb_submode1, NULL, NULL, NULL, NULL, NULL, NULL},
    1,
diff --git a/pjmedia/src/pjmedia-codec/speex/modes.h b/pjmedia/src/pjmedia-codec/speex/modes.h
index 945d004..9828ee6 100644
--- a/pjmedia/src/pjmedia-codec/speex/modes.h
+++ b/pjmedia/src/pjmedia-codec/speex/modes.h
@@ -55,17 +55,17 @@
 
 
 /** Long-term predictor quantization */
-typedef int (*ltp_quant_func)(spx_sig_t *, spx_sig_t *, spx_coef_t *, spx_coef_t *, 
+typedef int (*ltp_quant_func)(spx_word16_t *, spx_word16_t *, spx_coef_t *, spx_coef_t *, 
                               spx_coef_t *, spx_sig_t *, const void *, int, int, spx_word16_t, 
-                              int, int, SpeexBits*, char *, spx_sig_t *, spx_word16_t *, int, int, int);
+                              int, int, SpeexBits*, char *, spx_word16_t *, spx_word16_t *, int, int, int, spx_word32_t *);
 
 /** Long-term un-quantize */
-typedef void (*ltp_unquant_func)(spx_sig_t *, int, int, spx_word16_t, const void *, int, int *,
+typedef void (*ltp_unquant_func)(spx_word16_t *, spx_word32_t *, int, int, spx_word16_t, const void *, int, int *,
                                  spx_word16_t *, SpeexBits*, char*, int, int, spx_word16_t, int);
 
 
 /** Innovation quantization function */
-typedef void (*innovation_quant_func)(spx_sig_t *, spx_coef_t *, spx_coef_t *, spx_coef_t *, const void *, int, int, 
+typedef void (*innovation_quant_func)(spx_word16_t *, spx_coef_t *, spx_coef_t *, spx_coef_t *, const void *, int, int, 
                                       spx_sig_t *, spx_word16_t *, SpeexBits *, char *, int, int);
 
 /** Innovation unquantization function */
@@ -84,17 +84,13 @@
    /*Lont-term predictor functions*/
    ltp_quant_func    ltp_quant; /**< Long-term predictor (pitch) quantizer */
    ltp_unquant_func  ltp_unquant; /**< Long-term predictor (pitch) un-quantizer */
-   const void             *ltp_params; /**< Pitch parameters (options) */
+   const void       *ltp_params; /**< Pitch parameters (options) */
 
    /*Quantization of innovation*/
    innovation_quant_func innovation_quant; /**< Innovation quantization */
    innovation_unquant_func innovation_unquant; /**< Innovation un-quantization */
    const void             *innovation_params; /**< Innovation quantization parameters*/
 
-   /*Synthesis filter enhancement*/
-   spx_word16_t      lpc_enh_k1; /**< Enhancer constant */
-   spx_word16_t      lpc_enh_k2; /**< Enhancer constant */
-   spx_word16_t      lpc_enh_k3; /**< Enhancer constant */
    spx_word16_t      comb_gain;  /**< Gain of enhancer comb filter */
 
    int               bits_per_frame; /**< Number of bits per frame after encoding*/
diff --git a/pjmedia/src/pjmedia-codec/speex/nb_celp.c b/pjmedia/src/pjmedia-codec/speex/nb_celp.c
index f6f5c69..2c41649 100644
--- a/pjmedia/src/pjmedia-codec/speex/nb_celp.c
+++ b/pjmedia/src/pjmedia-codec/speex/nb_celp.c
@@ -46,6 +46,7 @@
 #include <speex/speex_bits.h>
 #include "vbr.h"
 #include "misc.h"
+#include "math_approx.h"
 #include <speex/speex_callbacks.h>
 
 #ifdef VORBIS_PSYCHO
@@ -106,6 +107,8 @@
 
 #define sqr(x) ((x)*(x))
 
+extern const spx_word16_t lpc_window[];
+
 void *nb_encoder_init(const SpeexMode *m)
 {
    EncState *st;
@@ -125,9 +128,9 @@
    st->mode=m;
 
    st->frameSize = mode->frameSize;
-   st->windowSize = st->frameSize*3/2;
    st->nbSubframes=mode->frameSize/mode->subframeSize;
    st->subframeSize=mode->subframeSize;
+   st->windowSize = st->frameSize+st->subframeSize;
    st->lpcSize = mode->lpcSize;
    st->gamma1=mode->gamma1;
    st->gamma2=mode->gamma2;
@@ -149,69 +152,50 @@
    st->psy = vorbis_psy_init(8000, 256);
    st->curve = speex_alloc(128*sizeof(float));
    st->old_curve = speex_alloc(128*sizeof(float));
+   st->psy_window = speex_alloc(256*sizeof(float));
 #endif
 
+   st->cumul_gain = 1024;
+
    /* Allocating input buffer */
-   st->inBuf = speex_alloc((st->windowSize+EXTRA_BUFFER)*sizeof(spx_sig_t));
-   st->frame = st->inBuf+EXTRA_BUFFER;
+   st->winBuf = speex_alloc((st->windowSize-st->frameSize)*sizeof(spx_word16_t));
    /* Allocating excitation buffer */
-   st->excBuf = speex_alloc((mode->frameSize+mode->pitchEnd+1)*sizeof(spx_sig_t));
-   st->exc = st->excBuf + mode->pitchEnd + 1;
-   st->swBuf = speex_alloc((mode->frameSize+mode->pitchEnd+1)*sizeof(spx_sig_t));
-   st->sw = st->swBuf + mode->pitchEnd + 1;
+   st->excBuf = speex_alloc((mode->frameSize+mode->pitchEnd+2)*sizeof(spx_word16_t));
+   st->exc = st->excBuf + mode->pitchEnd + 2;
+   st->swBuf = speex_alloc((mode->frameSize+mode->pitchEnd+2)*sizeof(spx_word16_t));
+   st->sw = st->swBuf + mode->pitchEnd + 2;
 
-   st->innov = speex_alloc((st->frameSize)*sizeof(spx_sig_t));
-
-   /* Asymmetric "pseudo-Hamming" window */
-   {
-      int part1, part2;
-      part1=st->frameSize - (st->subframeSize>>1);
-      part2=(st->frameSize>>1) + (st->subframeSize>>1);
-      st->window = speex_alloc((st->windowSize)*sizeof(spx_word16_t));
-      for (i=0;i<part1;i++)
-         st->window[i]=(spx_word16_t)(SIG_SCALING*(.54-.46*cos(M_PI*i/part1)));
-      for (i=0;i<part2;i++)
-         st->window[part1+i]=(spx_word16_t)(SIG_SCALING*(.54+.46*cos(M_PI*i/part2)));
-   }
+   st->window= lpc_window;
+   
    /* Create the window for autocorrelation (lag-windowing) */
    st->lagWindow = speex_alloc((st->lpcSize+1)*sizeof(spx_word16_t));
    for (i=0;i<st->lpcSize+1;i++)
       st->lagWindow[i]=16384*exp(-.5*sqr(2*M_PI*st->lag_factor*i));
 
-   st->autocorr = speex_alloc((st->lpcSize+1)*sizeof(spx_word16_t));
-
-   st->lpc = speex_alloc((st->lpcSize)*sizeof(spx_coef_t));
-   st->interp_lpc = speex_alloc((st->lpcSize)*sizeof(spx_coef_t));
-   st->interp_qlpc = speex_alloc((st->lpcSize)*sizeof(spx_coef_t));
-   st->bw_lpc1 = speex_alloc((st->lpcSize)*sizeof(spx_coef_t));
-   st->bw_lpc2 = speex_alloc((st->lpcSize)*sizeof(spx_coef_t));
-
-   st->lsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
-   st->qlsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
    st->old_lsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
    st->old_qlsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
-   st->interp_lsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
-   st->interp_qlsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
-
    st->first = 1;
    for (i=0;i<st->lpcSize;i++)
    {
-      st->lsp[i]=LSP_SCALING*(M_PI*((float)(i+1)))/(st->lpcSize+1);
+      st->old_lsp[i]=LSP_SCALING*(M_PI*((float)(i+1)))/(st->lpcSize+1);
    }
 
    st->mem_sp = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sw = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sw_whole = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_exc = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
+   st->mem_exc2 = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
 
    st->pi_gain = speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
-
+   st->innov_save = NULL;
+   
    st->pitch = speex_alloc((st->nbSubframes)*sizeof(int));
 
    st->vbr = speex_alloc(sizeof(VBRState));
    vbr_init(st->vbr);
    st->vbr_quality = 8;
    st->vbr_enabled = 0;
+   st->vbr_max = 0;
    st->vad_enabled = 0;
    st->dtx_enabled = 0;
    st->abr_enabled = 0;
@@ -236,30 +220,19 @@
    speex_free_scratch(st->stack);
 #endif
 
-   speex_free (st->inBuf);
+   speex_free (st->winBuf);
    speex_free (st->excBuf);
-   speex_free (st->innov);
-   speex_free (st->interp_qlpc);
-   speex_free (st->qlsp);
    speex_free (st->old_qlsp);
-   speex_free (st->interp_qlsp);
    speex_free (st->swBuf);
 
-   speex_free (st->window);
    speex_free (st->lagWindow);
-   speex_free (st->autocorr);
-   speex_free (st->lpc);
-   speex_free (st->lsp);
 
-   speex_free (st->interp_lpc);
-   speex_free (st->bw_lpc1);
-   speex_free (st->bw_lpc2);
    speex_free (st->old_lsp);
-   speex_free (st->interp_lsp);
    speex_free (st->mem_sp);
    speex_free (st->mem_sw);
    speex_free (st->mem_sw_whole);
    speex_free (st->mem_exc);
+   speex_free (st->mem_exc2);
    speex_free (st->pi_gain);
    speex_free (st->pitch);
 
@@ -270,6 +243,7 @@
    vorbis_psy_destroy(st->psy);
    speex_free (st->curve);
    speex_free (st->old_curve);
+   speex_free (st->psy_window);
 #endif
 
    /*Free state memory... should be last*/
@@ -283,12 +257,23 @@
    int ol_pitch;
    spx_word16_t ol_pitch_coef;
    spx_word32_t ol_gain;
-   VARDECL(spx_sig_t *res);
-   VARDECL(spx_sig_t *target);
+   VARDECL(spx_word16_t *ringing);
+   VARDECL(spx_word16_t *target);
+   VARDECL(spx_sig_t *innov);
+   VARDECL(spx_word32_t *exc32);
    VARDECL(spx_mem_t *mem);
+   VARDECL(spx_coef_t *bw_lpc1);
+   VARDECL(spx_coef_t *bw_lpc2);
+   VARDECL(spx_coef_t *lpc);
+   VARDECL(spx_lsp_t *lsp);
+   VARDECL(spx_lsp_t *qlsp);
+   VARDECL(spx_lsp_t *interp_lsp);
+   VARDECL(spx_lsp_t *interp_qlsp);
+   VARDECL(spx_coef_t *interp_lpc);
+   VARDECL(spx_coef_t *interp_qlpc);
    char *stack;
    VARDECL(spx_word16_t *syn_resp);
-   VARDECL(spx_sig_t *real_exc);
+   VARDECL(spx_word16_t *real_exc);
 #ifdef EPIC_48K
    int pitch_half[2];
    int ol_pitch_id=0;
@@ -298,79 +283,85 @@
    st=(EncState *)state;
    stack=st->stack;
 
-   /* Copy new data in input buffer */
-   speex_move(st->inBuf, st->inBuf+st->frameSize, (EXTRA_BUFFER+st->windowSize-st->frameSize)*sizeof(spx_sig_t));
-   for (i=0;i<st->frameSize;i++)
-      st->inBuf[st->windowSize-st->frameSize+i+EXTRA_BUFFER] = SHL32(EXTEND32(in[i]), SIG_SHIFT);
+   ALLOC(lpc, st->lpcSize, spx_coef_t);
+   ALLOC(bw_lpc1, st->lpcSize, spx_coef_t);
+   ALLOC(bw_lpc2, st->lpcSize, spx_coef_t);
+   ALLOC(lsp, st->lpcSize, spx_lsp_t);
+   ALLOC(qlsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_lsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_qlsp, st->lpcSize, spx_lsp_t);
+   ALLOC(interp_lpc, st->lpcSize, spx_coef_t);
+   ALLOC(interp_qlpc, st->lpcSize, spx_coef_t);
 
    /* Move signals 1 frame towards the past */
-   speex_move(st->excBuf, st->excBuf+st->frameSize, (st->max_pitch+1)*sizeof(spx_sig_t));
-   speex_move(st->swBuf, st->swBuf+st->frameSize, (st->max_pitch+1)*sizeof(spx_sig_t));
+   speex_move(st->excBuf, st->excBuf+st->frameSize, (st->max_pitch+2)*sizeof(spx_word16_t));
+   speex_move(st->swBuf, st->swBuf+st->frameSize, (st->max_pitch+2)*sizeof(spx_word16_t));
 
    {
       VARDECL(spx_word16_t *w_sig);
+      VARDECL(spx_word16_t *autocorr);
       ALLOC(w_sig, st->windowSize, spx_word16_t);
+      ALLOC(autocorr, st->lpcSize+1, spx_word16_t);
       /* Window for analysis */
-      for (i=0;i<st->windowSize;i++)
-         w_sig[i] = EXTRACT16(SHR32(MULT16_16(EXTRACT16(SHR32(st->frame[i],SIG_SHIFT)),st->window[i]),SIG_SHIFT));
-
+      for (i=0;i<st->windowSize-st->frameSize;i++)
+         w_sig[i] = EXTRACT16(SHR32(MULT16_16(st->winBuf[i],st->window[i]),SIG_SHIFT));
+      for (;i<st->windowSize;i++)
+         w_sig[i] = EXTRACT16(SHR32(MULT16_16(in[i-st->windowSize+st->frameSize],st->window[i]),SIG_SHIFT));
       /* Compute auto-correlation */
-      _spx_autocorr(w_sig, st->autocorr, st->lpcSize+1, st->windowSize);
-   }
-   st->autocorr[0] = ADD16(st->autocorr[0],MULT16_16_Q15(st->autocorr[0],st->lpc_floor)); /* Noise floor in auto-correlation domain */
+      _spx_autocorr(w_sig, autocorr, st->lpcSize+1, st->windowSize);
+      autocorr[0] = ADD16(autocorr[0],MULT16_16_Q15(autocorr[0],st->lpc_floor)); /* Noise floor in auto-correlation domain */
 
-   /* Lag windowing: equivalent to filtering in the power-spectrum domain */
-   for (i=0;i<st->lpcSize+1;i++)
-      st->autocorr[i] = MULT16_16_Q14(st->autocorr[i],st->lagWindow[i]);
+      /* Lag windowing: equivalent to filtering in the power-spectrum domain */
+      for (i=0;i<st->lpcSize+1;i++)
+         autocorr[i] = MULT16_16_Q14(autocorr[i],st->lagWindow[i]);
 
-   /* Levinson-Durbin */
-   _spx_lpc(st->lpc, st->autocorr, st->lpcSize);
-
-   /* LPC to LSPs (x-domain) transform */
-   roots=lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 15, LSP_DELTA1, stack);
-   /* Check if we found all the roots */
-   if (roots!=st->lpcSize)
-   {
-      /* Search again if we can afford it */
-      if (st->complexity>1)
-         roots = lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 11, LSP_DELTA2, stack);
-      if (roots!=st->lpcSize) 
+      /* Levinson-Durbin */
+      _spx_lpc(lpc, autocorr, st->lpcSize);
+      /* LPC to LSPs (x-domain) transform */
+      roots=lpc_to_lsp (lpc, st->lpcSize, lsp, 10, LSP_DELTA1, stack);
+      /* Check if we found all the roots */
+      if (roots!=st->lpcSize)
       {
          /*If we can't find all LSP's, do some damage control and use previous filter*/
          for (i=0;i<st->lpcSize;i++)
          {
-            st->lsp[i]=st->old_lsp[i];
+            lsp[i]=st->old_lsp[i];
          }
       }
    }
 
 
 
+
    /* Whole frame analysis (open-loop estimation of pitch and excitation gain) */
    {
       if (st->first)
          for (i=0;i<st->lpcSize;i++)
-            st->interp_lsp[i] = st->lsp[i];
+            interp_lsp[i] = lsp[i];
       else
-         lsp_interpolate(st->old_lsp, st->lsp, st->interp_lsp, st->lpcSize, st->nbSubframes, st->nbSubframes<<1);
+         lsp_interpolate(st->old_lsp, lsp, interp_lsp, st->lpcSize, st->nbSubframes, st->nbSubframes<<1);
 
-      lsp_enforce_margin(st->interp_lsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_lsp, st->lpcSize, LSP_MARGIN);
 
       /* Compute interpolated LPCs (unquantized) for whole frame*/
-      lsp_to_lpc(st->interp_lsp, st->interp_lpc, st->lpcSize,stack);
+      lsp_to_lpc(interp_lsp, interp_lpc, st->lpcSize,stack);
 
 
       /*Open-loop pitch*/
-      if (!st->submodes[st->submodeID] || st->vbr_enabled || st->vad_enabled || SUBMODE(forced_pitch_gain) ||
+      if (st->complexity>2 || !st->submodes[st->submodeID] || st->vbr_enabled || st->vad_enabled || SUBMODE(forced_pitch_gain) ||
           SUBMODE(lbr_pitch) != -1)
       {
          int nol_pitch[6];
          spx_word16_t nol_pitch_coef[6];
          
-         bw_lpc(st->gamma1, st->interp_lpc, st->bw_lpc1, st->lpcSize);
-         bw_lpc(st->gamma2, st->interp_lpc, st->bw_lpc2, st->lpcSize);
+         bw_lpc(st->gamma1, interp_lpc, bw_lpc1, st->lpcSize);
+         bw_lpc(st->gamma2, interp_lpc, bw_lpc2, st->lpcSize);
          
-         filter_mem2(st->frame, st->bw_lpc1, st->bw_lpc2, st->sw, st->frameSize, st->lpcSize, st->mem_sw_whole);
+         for (i=0;i<st->windowSize-st->frameSize;i++)
+            st->sw[i] = st->winBuf[i];
+         for (;i<st->frameSize;i++)
+            st->sw[i] = in[i-st->windowSize+st->frameSize];
+         filter_mem16(st->sw, bw_lpc1, bw_lpc2, st->sw, st->frameSize, st->lpcSize, st->mem_sw_whole, stack);
 
          open_loop_nbest_pitch(st->sw, st->min_pitch, st->max_pitch, st->frameSize, 
                                nol_pitch, nol_pitch_coef, 6, stack);
@@ -412,8 +403,13 @@
          ol_pitch=0;
          ol_pitch_coef=0;
       }
+      
       /*Compute "real" excitation*/
-      fir_mem2(st->frame, st->interp_lpc, st->exc, st->frameSize, st->lpcSize, st->mem_exc);
+      for (i=0;i<st->windowSize-st->frameSize;i++)
+         st->exc[i] = st->winBuf[i];
+      for (;i<st->frameSize;i++)
+         st->exc[i] = in[i-st->windowSize+st->frameSize];
+      fir_mem16(st->exc, interp_lpc, st->exc, st->frameSize, st->lpcSize, st->mem_exc, stack);
 
       /* Compute open-loop excitation gain */
 #ifdef EPIC_48K
@@ -421,8 +417,8 @@
       {
          float ol1=0,ol2=0;
          float ol_gain2;
-         ol1 = compute_rms(st->exc, st->frameSize>>1);
-         ol2 = compute_rms(st->exc+(st->frameSize>>1), st->frameSize>>1);
+         ol1 = compute_rms16(st->exc, st->frameSize>>1);
+         ol2 = compute_rms16(st->exc+(st->frameSize>>1), st->frameSize>>1);
          ol1 *= ol1*(st->frameSize>>1);
          ol2 *= ol2*(st->frameSize>>1);
 
@@ -433,16 +429,24 @@
       
          ol_gain=SHR(sqrt(1+ol_gain2/st->frameSize),SIG_SHIFT);
 
-      } else {
+      } else
 #endif
-         ol_gain = SHL32(EXTEND32(compute_rms(st->exc, st->frameSize)),SIG_SHIFT);
-#ifdef EPIC_48K
+      {
+         spx_word16_t g = compute_rms16(st->exc, st->frameSize);
+         if (ol_pitch>0)
+            ol_gain = MULT16_16(g, MULT16_16_Q14(QCONST16(1.1,14),
+                                spx_sqrt(QCONST32(1.,28)-MULT16_32_Q15(QCONST16(.8,15),SHL32(MULT16_16(ol_pitch_coef,ol_pitch_coef),16)))));
+         else
+            ol_gain = SHL32(EXTEND32(g),SIG_SHIFT);
       }
-#endif
    }
 
 #ifdef VORBIS_PSYCHO
-   compute_curve(st->psy, st->frame-16, st->curve);
+   for(i=0;i<256-st->frameSize;i++)
+      st->psy_window[i] = st->psy_window[i+st->frameSize];
+   for(i=0;i<st->frameSize;i++)
+      st->psy_window[256-st->frameSize+i] = in[i];
+   compute_curve(st->psy, st->psy_window, st->curve);
    /*print_vec(st->curve, 128, "curve");*/
    if (st->first)
       for (i=0;i<128;i++)
@@ -454,7 +458,7 @@
    {
       float lsp_dist=0;
       for (i=0;i<st->lpcSize;i++)
-         lsp_dist += (st->old_lsp[i] - st->lsp[i])*(st->old_lsp[i] - st->lsp[i]);
+         lsp_dist += (st->old_lsp[i] - lsp[i])*(st->old_lsp[i] - lsp[i]);
       lsp_dist /= LSP_SCALING*LSP_SCALING;
       
       if (st->abr_enabled)
@@ -518,7 +522,17 @@
          }
 
          speex_encoder_ctl(state, SPEEX_SET_MODE, &mode);
-
+         if (st->vbr_max>0)
+         {
+            spx_int32_t rate;
+            speex_encoder_ctl(state, SPEEX_GET_BITRATE, &rate);
+            if (rate > st->vbr_max)
+            {
+               rate = st->vbr_max;
+               speex_encoder_ctl(state, SPEEX_SET_BITRATE, &rate);
+            }
+         }
+         
          if (st->abr_enabled)
          {
             int bitrate;
@@ -580,13 +594,11 @@
       st->first=1;
       st->bounded_pitch = 1;
 
-      /* Final signal synthesis from excitation */
-      iir_mem2(st->exc, st->interp_qlpc, st->frame, st->frameSize, st->lpcSize, st->mem_sp);
+      speex_move(st->winBuf, in+2*st->frameSize-st->windowSize, (st->windowSize-st->frameSize)*sizeof(spx_word16_t));
 
-#ifdef RESYNTH
-      for (i=0;i<st->frameSize;i++)
-         in[i]=st->frame[i];
-#endif
+      /* Clear memory (no need to really compute it) */
+      for (i=0;i<st->lpcSize;i++)
+         st->mem_sp[i] = 0;
       return 0;
 
    }
@@ -595,16 +607,16 @@
    if (st->first)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_lsp[i] = st->lsp[i];
+         st->old_lsp[i] = lsp[i];
    }
 
 
    /*Quantize LSPs*/
 #if 1 /*0 for unquantized*/
-   SUBMODE(lsp_quant)(st->lsp, st->qlsp, st->lpcSize, bits);
+   SUBMODE(lsp_quant)(lsp, qlsp, st->lpcSize, bits);
 #else
    for (i=0;i<st->lpcSize;i++)
-     st->qlsp[i]=st->lsp[i];
+     qlsp[i]=lsp[i];
 #endif
 
 #ifdef EPIC_48K
@@ -685,22 +697,25 @@
    if (st->first)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_qlsp[i] = st->qlsp[i];
+         st->old_qlsp[i] = qlsp[i];
    }
 
-   /* Filter response */
-   ALLOC(res, st->subframeSize, spx_sig_t);
    /* Target signal */
-   ALLOC(target, st->subframeSize, spx_sig_t);
+   ALLOC(target, st->subframeSize, spx_word16_t);
+   ALLOC(innov, st->subframeSize, spx_sig_t);
+   ALLOC(exc32, st->subframeSize, spx_word32_t);
+   ALLOC(ringing, st->subframeSize, spx_word16_t);
    ALLOC(syn_resp, st->subframeSize, spx_word16_t);
-   ALLOC(real_exc, st->subframeSize, spx_sig_t);
+   ALLOC(real_exc, st->subframeSize, spx_word16_t);
    ALLOC(mem, st->lpcSize, spx_mem_t);
 
    /* Loop on sub-frames */
    for (sub=0;sub<st->nbSubframes;sub++)
    {
       int   offset;
-      spx_sig_t *sp, *sw, *exc;
+      spx_word16_t *sw;
+      spx_word16_t *exc;
+      spx_sig_t *innov_save = NULL;
       int pitch;
       int response_bound = st->subframeSize;
 #ifdef EPIC_48K
@@ -715,25 +730,26 @@
 
       /* Offset relative to start of frame */
       offset = st->subframeSize*sub;
-      /* Original signal */
-      sp=st->frame+offset;
       /* Excitation */
       exc=st->exc+offset;
       /* Weighted signal */
       sw=st->sw+offset;
-
+      /* Pointer for saving innovation */
+      if (st->innov_save)
+         innov_save = st->innov_save+offset;
+      
       /* LSP interpolation (quantized and unquantized) */
-      lsp_interpolate(st->old_lsp, st->lsp, st->interp_lsp, st->lpcSize, sub, st->nbSubframes);
-      lsp_interpolate(st->old_qlsp, st->qlsp, st->interp_qlsp, st->lpcSize, sub, st->nbSubframes);
+      lsp_interpolate(st->old_lsp, lsp, interp_lsp, st->lpcSize, sub, st->nbSubframes);
+      lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, st->lpcSize, sub, st->nbSubframes);
 
       /* Make sure the filters are stable */
-      lsp_enforce_margin(st->interp_lsp, st->lpcSize, LSP_MARGIN);
-      lsp_enforce_margin(st->interp_qlsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_lsp, st->lpcSize, LSP_MARGIN);
+      lsp_enforce_margin(interp_qlsp, st->lpcSize, LSP_MARGIN);
 
       /* Compute interpolated LPCs (quantized and unquantized) */
-      lsp_to_lpc(st->interp_lsp, st->interp_lpc, st->lpcSize,stack);
+      lsp_to_lpc(interp_lsp, interp_lpc, st->lpcSize,stack);
 
-      lsp_to_lpc(st->interp_qlsp, st->interp_qlpc, st->lpcSize, stack);
+      lsp_to_lpc(interp_qlsp, interp_qlpc, st->lpcSize, stack);
 
       /* Compute analysis filter gain at w=pi (for use in SB-CELP) */
       {
@@ -741,7 +757,7 @@
          for (i=0;i<st->lpcSize;i+=2)
          {
             /*pi_g += -st->interp_qlpc[i] +  st->interp_qlpc[i+1];*/
-            pi_g = ADD32(pi_g, SUB32(st->interp_qlpc[i+1],st->interp_qlpc[i]));
+            pi_g = ADD32(pi_g, SUB32(EXTEND32(interp_qlpc[i+1]),EXTEND32(interp_qlpc[i])));
          }
          st->pi_gain[sub] = pi_g;
       }
@@ -752,56 +768,66 @@
          float fact = ((float)sub+1.0f)/st->nbSubframes;
          for (i=0;i<128;i++)
             curr_curve[i] = (1.0f-fact)*st->old_curve[i] + fact*st->curve[i];
-         curve_to_lpc(st->psy, curr_curve, st->bw_lpc1, st->bw_lpc2, 10);
+         curve_to_lpc(st->psy, curr_curve, bw_lpc1, bw_lpc2, 10);
       }
 #else
       /* Compute bandwidth-expanded (unquantized) LPCs for perceptual weighting */
-      bw_lpc(st->gamma1, st->interp_lpc, st->bw_lpc1, st->lpcSize);
+      bw_lpc(st->gamma1, interp_lpc, bw_lpc1, st->lpcSize);
       if (st->gamma2>=0)
-         bw_lpc(st->gamma2, st->interp_lpc, st->bw_lpc2, st->lpcSize);
+         bw_lpc(st->gamma2, interp_lpc, bw_lpc2, st->lpcSize);
       else
       {
-         st->bw_lpc2[0]=1;
+         bw_lpc2[0]=1;
          for (i=1;i<=st->lpcSize;i++)
-            st->bw_lpc2[i]=0;
+            bw_lpc2[i]=0;
       }
       /*print_vec(st->bw_lpc1, 10, "bw_lpc");*/
 #endif
 
-      for (i=0;i<st->subframeSize;i++)
-         real_exc[i] = exc[i];
+      {
+         /*FIXME: This will break if we change the window size */
+         if (st->windowSize-st->frameSize != st->subframeSize)
+            speex_error("windowSize-frameSize != subframeSize");
+         if (sub==0)
+         {
+            for (i=0;i<st->subframeSize;i++)
+               real_exc[i] = sw[i] = st->winBuf[i];
+         } else {
+            for (i=0;i<st->subframeSize;i++)
+               real_exc[i] = sw[i] = in[i+((sub-1)*st->subframeSize)];
+         }
+      }
+      fir_mem16(real_exc, interp_qlpc, real_exc, st->subframeSize, st->lpcSize, st->mem_exc2, stack);
       
       if (st->complexity==0)
          response_bound >>= 1;
-      compute_impulse_response(st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, syn_resp, response_bound, st->lpcSize, stack);
+      compute_impulse_response(interp_qlpc, bw_lpc1, bw_lpc2, syn_resp, response_bound, st->lpcSize, stack);
       for (i=response_bound;i<st->subframeSize;i++)
          syn_resp[i]=VERY_SMALL;
       
-      /* Reset excitation */
-      for (i=0;i<st->subframeSize;i++)
-         exc[i]=VERY_SMALL;
-
       /* Compute zero response of A(z/g1) / ( A(z/g2) * A(z) ) */
       for (i=0;i<st->lpcSize;i++)
-         mem[i]=st->mem_sp[i];
+         mem[i]=SHL32(st->mem_sp[i],1);
+      for (i=0;i<st->subframeSize;i++)
+         ringing[i] = VERY_SMALL;
 #ifdef SHORTCUTS2
-      iir_mem2(exc, st->interp_qlpc, exc, response_bound, st->lpcSize, mem);
+      iir_mem16(ringing, interp_qlpc, ringing, response_bound, st->lpcSize, mem, stack);
       for (i=0;i<st->lpcSize;i++)
-         mem[i]=st->mem_sw[i];
-      filter_mem2(exc, st->bw_lpc1, st->bw_lpc2, res, response_bound, st->lpcSize, mem);
+         mem[i]=SHL32(st->mem_sw[i],1);
+      filter_mem16(ringing, st->bw_lpc1, st->bw_lpc2, ringing, response_bound, st->lpcSize, mem, stack);
       for (i=response_bound;i<st->subframeSize;i++)
-         res[i]=0;
+         ringing[i]=0;
 #else
-      iir_mem2(exc, st->interp_qlpc, exc, st->subframeSize, st->lpcSize, mem);
+      iir_mem16(ringing, interp_qlpc, ringing, st->subframeSize, st->lpcSize, mem, stack);
       for (i=0;i<st->lpcSize;i++)
-         mem[i]=st->mem_sw[i];
-      filter_mem2(exc, st->bw_lpc1, st->bw_lpc2, res, st->subframeSize, st->lpcSize, mem);
+         mem[i]=SHL32(st->mem_sw[i],1);
+      filter_mem16(ringing, bw_lpc1, bw_lpc2, ringing, st->subframeSize, st->lpcSize, mem, stack);
 #endif
       
       /* Compute weighted signal */
       for (i=0;i<st->lpcSize;i++)
          mem[i]=st->mem_sw[i];
-      filter_mem2(sp, st->bw_lpc1, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, mem);
+      filter_mem16(sw, bw_lpc1, bw_lpc2, sw, st->subframeSize, st->lpcSize, mem, stack);
       
       if (st->complexity==0)
          for (i=0;i<st->lpcSize;i++)
@@ -809,8 +835,9 @@
       
       /* Compute target signal */
       for (i=0;i<st->subframeSize;i++)
-         target[i]=sw[i]-res[i];
+         target[i]=SUB16(sw[i],PSHR32(ringing[i],1));
 
+      /* Reset excitation */
       for (i=0;i<st->subframeSize;i++)
          exc[i]=0;
 
@@ -847,18 +874,18 @@
 #ifdef EPIC_48K
          if (st->lbr_48k)
          {
-            pitch = SUBMODE(ltp_quant)(target, sw, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
-                                       exc, SUBMODE(ltp_params), pit_min, pit_max, ol_pitch_coef,
+            pitch = SUBMODE(ltp_quant)(target, sw, interp_qlpc, bw_lpc1, bw_lpc2,
+                                       exc32, SUBMODE(ltp_params), pit_min, pit_max, ol_pitch_coef,
                                        st->lpcSize, st->subframeSize, bits, stack, 
-                                       exc, syn_resp, st->complexity, ol_pitch_id, st->plc_tuning);
+                                       exc, syn_resp, st->complexity, ol_pitch_id, st->plc_tuning, &st->cumul_gain);
          } else {
 #endif
 
          /* Perform pitch search */
-         pitch = SUBMODE(ltp_quant)(target, sw, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
-                                    exc, SUBMODE(ltp_params), pit_min, pit_max, ol_pitch_coef,
+         pitch = SUBMODE(ltp_quant)(target, sw, interp_qlpc, bw_lpc1, bw_lpc2,
+                                    exc32, SUBMODE(ltp_params), pit_min, pit_max, ol_pitch_coef,
                                     st->lpcSize, st->subframeSize, bits, stack, 
-                                    exc, syn_resp, st->complexity, 0, st->plc_tuning);
+                                    exc, syn_resp, st->complexity, 0, st->plc_tuning, &st->cumul_gain);
 #ifdef EPIC_48K
          }
 #endif
@@ -870,30 +897,28 @@
 
       /* Quantization of innovation */
       {
-         spx_sig_t *innov;
          spx_word32_t ener=0;
          spx_word16_t fine_gain;
 
-         innov = st->innov+sub*st->subframeSize;
          for (i=0;i<st->subframeSize;i++)
             innov[i]=0;
          
          for (i=0;i<st->subframeSize;i++)
-            real_exc[i] = SUB32(real_exc[i], exc[i]);
+            real_exc[i] = SUB16(real_exc[i], PSHR32(exc32[i],SIG_SHIFT-1));
 
-         ener = SHL32(EXTEND32(compute_rms(real_exc, st->subframeSize)),SIG_SHIFT);
+         ener = SHL32(EXTEND32(compute_rms16(real_exc, st->subframeSize)),SIG_SHIFT);
          
          /*FIXME: Should use DIV32_16 and make sure result fits in 16 bits */
 #ifdef FIXED_POINT
          {
-            spx_word32_t f = DIV32(ener,PSHR32(ol_gain,SIG_SHIFT));
+            spx_word32_t f = PDIV32(ener,PSHR32(ol_gain,SIG_SHIFT));
             if (f<=32767)
                fine_gain = f;
             else
                fine_gain = 32767;
          }
 #else
-         fine_gain = DIV32_16(ener,PSHR32(ol_gain,SIG_SHIFT));
+         fine_gain = PDIV32_16(ener,PSHR32(ol_gain,SIG_SHIFT));
 #endif
          /* Calculate gain correction for the sub-frame (if any) */
          if (SUBMODE(have_subframe_gain)) 
@@ -922,7 +947,7 @@
          if (SUBMODE(innovation_quant))
          {
             /* Codebook search */
-            SUBMODE(innovation_quant)(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, 
+            SUBMODE(innovation_quant)(target, interp_qlpc, bw_lpc1, bw_lpc2, 
                                       SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
                                       innov, syn_resp, bits, stack, st->complexity, SUBMODE(double_codebook));
             
@@ -930,11 +955,16 @@
             signal_mul(innov, innov, ener, st->subframeSize);
 
             for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD32(exc[i],innov[i]);
+               exc[i] = EXTRACT16(PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT));
          } else {
             speex_error("No fixed codebook");
          }
 
+         if (innov_save)
+         {
+            for (i=0;i<st->subframeSize;i++)
+               innov_save[i] = innov[i];
+         }
          /* In some (rare) modes, we do a second search (more bits) to reduce noise even more */
          if (SUBMODE(double_codebook)) {
             char *tmp_stack=stack;
@@ -943,24 +973,31 @@
             for (i=0;i<st->subframeSize;i++)
                innov2[i]=0;
             for (i=0;i<st->subframeSize;i++)
-               target[i]*=2.2;
-            SUBMODE(innovation_quant)(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, 
+               target[i]=MULT16_16_P13(QCONST16(2.2,13), target[i]);
+            SUBMODE(innovation_quant)(target, interp_qlpc, bw_lpc1, bw_lpc2, 
                                       SUBMODE(innovation_params), st->lpcSize, st->subframeSize, 
                                       innov2, syn_resp, bits, stack, st->complexity, 0);
-            signal_mul(innov2, innov2, (spx_word32_t) (ener*(1.f/2.2f)), st->subframeSize);
+            signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545,15),ener), st->subframeSize);
             for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD32(exc[i],innov2[i]);
+               exc[i] = ADD32(exc[i],PSHR32(innov2[i],SIG_SHIFT));
+            if (innov_save)
+            {
+               for (i=0;i<st->subframeSize;i++)
+                  innov_save[i] = ADD32(innov_save[i],innov2[i]);
+            }
             stack = tmp_stack;
          }
 
       }
 
+      for (i=0;i<st->subframeSize;i++)
+         sw[i] = exc[i];
       /* Final signal synthesis from excitation */
-      iir_mem2(exc, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, st->mem_sp);
+      iir_mem16(sw, interp_qlpc, sw, st->subframeSize, st->lpcSize, st->mem_sp, stack);
 
       /* Compute weighted signal again, from synthesized speech (not sure it's the right thing) */
       if (st->complexity!=0)
-         filter_mem2(sp, st->bw_lpc1, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, st->mem_sw);
+         filter_mem16(sw, bw_lpc1, bw_lpc2, sw, st->subframeSize, st->lpcSize, st->mem_sw, stack);
       
    }
 
@@ -968,9 +1005,9 @@
    if (st->submodeID>=1)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_lsp[i] = st->lsp[i];
+         st->old_lsp[i] = lsp[i];
       for (i=0;i<st->lpcSize;i++)
-         st->old_qlsp[i] = st->qlsp[i];
+         st->old_qlsp[i] = qlsp[i];
    }
 
 #ifdef VORBIS_PSYCHO
@@ -991,19 +1028,7 @@
 
    /* The next frame will not be the first (Duh!) */
    st->first = 0;
-
-#ifdef RESYNTH
-   /* Replace input by synthesized speech */
-   for (i=0;i<st->frameSize;i++)
-   {
-      spx_word32_t sig = PSHR32(st->frame[i],SIG_SHIFT);
-      if (sig>32767)
-         sig = 32767;
-      if (sig<-32767)
-         sig = -32767;
-     in[i]=sig;
-   }
-#endif
+   speex_move(st->winBuf, in+2*st->frameSize-st->windowSize, (st->windowSize-st->frameSize)*sizeof(spx_word16_t));
 
    if (SUBMODE(innovation_quant) == noise_codebook_quant || st->submodeID==0)
       st->bounded_pitch = 1;
@@ -1013,7 +1038,6 @@
    return 1;
 }
 
-
 void *nb_decoder_init(const SpeexMode *m)
 {
    DecState *st;
@@ -1050,27 +1074,16 @@
    st->submodes=mode->submodes;
    st->submodeID=mode->defaultSubmode;
 
-   st->lpc_enh_enabled=0;
+   st->lpc_enh_enabled=1;
 
-
-   st->inBuf = speex_alloc((st->frameSize)*sizeof(spx_sig_t));
-   st->frame = st->inBuf;
-   st->excBuf = speex_alloc((st->frameSize + st->max_pitch + 1)*sizeof(spx_sig_t));
-   st->exc = st->excBuf + st->max_pitch + 1;
-   for (i=0;i<st->frameSize;i++)
-      st->inBuf[i]=0;
+   st->excBuf = speex_alloc((st->frameSize + 2*st->max_pitch + st->subframeSize + 12)*sizeof(spx_word16_t));
+   st->exc = st->excBuf + 2*st->max_pitch + st->subframeSize + 6;
    for (i=0;i<st->frameSize + st->max_pitch + 1;i++)
       st->excBuf[i]=0;
-   st->innov = speex_alloc((st->frameSize)*sizeof(spx_sig_t));
 
    st->interp_qlpc = speex_alloc(st->lpcSize*sizeof(spx_coef_t));
-   st->qlsp = speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
    st->old_qlsp = speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->interp_qlsp = speex_alloc(st->lpcSize*sizeof(spx_lsp_t));
-   st->mem_sp = speex_alloc((5*st->lpcSize)*sizeof(spx_mem_t));
-   st->comb_mem = speex_alloc(sizeof(CombFilterMem));
-   comb_filter_mem_init (st->comb_mem);
-
+   st->mem_sp = speex_alloc(st->lpcSize*sizeof(spx_mem_t));
    st->pi_gain = speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
    st->last_pitch = 40;
    st->count_lost=0;
@@ -1104,15 +1117,10 @@
    speex_free_scratch(st->stack);
 #endif
 
-   speex_free (st->inBuf);
    speex_free (st->excBuf);
-   speex_free (st->innov);
    speex_free (st->interp_qlpc);
-   speex_free (st->qlsp);
    speex_free (st->old_qlsp);
-   speex_free (st->interp_qlsp);
    speex_free (st->mem_sp);
-   speex_free (st->comb_mem);
    speex_free (st->pi_gain);
 
    speex_free(state);
@@ -1131,9 +1139,6 @@
 {
    int i, sub;
    int pitch_val;
-   VARDECL(spx_coef_t *awk1);
-   VARDECL(spx_coef_t *awk2);
-   VARDECL(spx_coef_t *awk3);
    spx_word16_t pitch_gain;
    spx_word16_t fact;
    spx_word16_t gain_med;
@@ -1162,48 +1167,27 @@
    pitch_gain = MULT16_16_Q15(fact,pitch_gain) + VERY_SMALL;
 
    /* Shift all buffers by one frame */
-   /*speex_move(st->inBuf, st->inBuf+st->frameSize, (st->bufSize-st->frameSize)*sizeof(spx_sig_t));*/
-   speex_move(st->excBuf, st->excBuf+st->frameSize, (st->max_pitch + 1)*sizeof(spx_sig_t));
-
-   ALLOC(awk1, (st->lpcSize+1), spx_coef_t);
-   ALLOC(awk2, (st->lpcSize+1), spx_coef_t);
-   ALLOC(awk3, (st->lpcSize+1), spx_coef_t);
-
+   speex_move(st->excBuf, st->excBuf+st->frameSize, (2*st->max_pitch + st->subframeSize + 12)*sizeof(spx_word16_t));
    for (sub=0;sub<st->nbSubframes;sub++)
    {
       int offset;
-      spx_sig_t *sp, *exc;
+      spx_word16_t *sp;
+      spx_word16_t *exc;
       /* Offset relative to start of frame */
       offset = st->subframeSize*sub;
       /* Original signal */
-      sp=st->frame+offset;
+      sp=out+offset;
       /* Excitation */
       exc=st->exc+offset;
       /* Excitation after post-filter*/
-
-      /* Calculate perceptually enhanced LPC filter */
-      if (st->lpc_enh_enabled)
-      {
-         spx_word16_t k1,k2,k3;
-         if (st->submodes[st->submodeID] != NULL)
-         {
-            k1=SUBMODE(lpc_enh_k1);
-            k2=SUBMODE(lpc_enh_k2);
-            k3=SUBMODE(lpc_enh_k3);
-         } else {
-            k1=k2=.7*GAMMA_SCALING;
-            k3=.0;
-         }
-         bw_lpc(k1, st->interp_qlpc, awk1, st->lpcSize);
-         bw_lpc(k2, st->interp_qlpc, awk2, st->lpcSize);
-         bw_lpc(k3, st->interp_qlpc, awk3, st->lpcSize);
-      }
         
       /* Make up a plausible excitation */
       /* FIXME: THIS CAN BE IMPROVED */
       /*if (pitch_gain>.95)
         pitch_gain=.95;*/
-      innov_gain = compute_rms(st->innov, st->frameSize);
+      
+      /* FIXME: This was rms of innovation (not exc) */
+      innov_gain = compute_rms16(st->exc, st->frameSize);
       pitch_val = st->last_pitch + SHR32((spx_int32_t)speex_rand(1+st->count_lost, &st->seed),SIG_SHIFT);
       if (pitch_val > st->max_pitch)
          pitch_val = st->max_pitch;
@@ -1211,36 +1195,16 @@
          pitch_val = st->min_pitch;
       for (i=0;i<st->subframeSize;i++)
       {
-         exc[i]= MULT16_32_Q15(pitch_gain, (exc[i-pitch_val]+VERY_SMALL)) + 
-               MULT16_32_Q15(fact, MULT16_32_Q15(SHL(Q15ONE,15)-SHL(MULT16_16(pitch_gain,pitch_gain),1),speex_rand(innov_gain, &st->seed)));
+         /* FIXME: Second term needs to be 16-bit */
+         exc[i]= MULT16_16_Q15(pitch_gain, (exc[i-pitch_val]+VERY_SMALL)) + 
+               MULT16_16_Q15(fact, MULT16_16_Q15(SHL(Q15ONE,15)-SHL(MULT16_16(pitch_gain,pitch_gain),1),speex_rand(innov_gain, &st->seed)));
       }
-      
       for (i=0;i<st->subframeSize;i++)
-         sp[i]=exc[i];
-      
-      /* Signal synthesis */
-      if (st->lpc_enh_enabled)
-      {
-         filter_mem2(sp, awk2, awk1, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp+st->lpcSize);
-         filter_mem2(sp, awk3, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
-      } else {
-         for (i=0;i<st->lpcSize;i++)
-            st->mem_sp[st->lpcSize+i] = 0;
-         iir_mem2(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
-      }      
-   }
+         sp[i]=exc[i-st->subframeSize];
+      iir_mem16(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
+                st->mem_sp, stack);
 
-   for (i=0;i<st->frameSize;i++)
-   {
-      spx_word32_t sig = PSHR32(st->frame[i],SIG_SHIFT);
-      if (sig>32767)
-         sig = 32767;
-      if (sig<-32767)
-         sig = -32767;
-     out[i]=sig;
+      bw_lpc(QCONST16(.98,15), st->interp_qlpc, st->interp_qlpc, st->lpcSize);
    }
    
    st->first = 0;
@@ -1250,6 +1214,7 @@
       st->pitch_gain_buf_idx = 0;
 }
 
+
 int nb_decode(void *state, SpeexBits *bits, void *vout)
 {
    DecState *st;
@@ -1264,15 +1229,17 @@
    int wideband;
    int m;
    char *stack;
-   VARDECL(spx_coef_t *awk1);
-   VARDECL(spx_coef_t *awk2);
-   VARDECL(spx_coef_t *awk3);
+   VARDECL(spx_sig_t *innov);
+   VARDECL(spx_word32_t *exc32);
+   VARDECL(spx_coef_t *ak);
+   VARDECL(spx_lsp_t *qlsp);
    spx_word16_t pitch_average=0;
 #ifdef EPIC_48K
    int pitch_half[2];
    int ol_pitch_id=0;
 #endif
    spx_word16_t *out = vout;
+   VARDECL(spx_lsp_t *interp_qlsp);
 
    st=(DecState*)state;
    stack=st->stack;
@@ -1373,7 +1340,7 @@
    }
 
    /* Shift all buffers by one frame */
-   speex_move(st->excBuf, st->excBuf+st->frameSize, (st->max_pitch + 1)*sizeof(spx_sig_t));
+   speex_move(st->excBuf, st->excBuf+st->frameSize, (2*st->max_pitch + st->subframeSize + 12)*sizeof(spx_word16_t));
 
    /* If null mode (no transmission), just set a couple things to zero*/
    if (st->submodes[st->submodeID] == NULL)
@@ -1386,34 +1353,28 @@
          float pgain=GAIN_SCALING_1*st->last_pitch_gain;
          if (pgain>.6)
             pgain=.6;
-	 innov_gain = compute_rms(st->innov, st->frameSize);
+         /* FIXME: This was innov, not exc */
+         innov_gain = compute_rms16(st->exc, st->frameSize);
          for (i=0;i<st->frameSize;i++)
-            st->exc[i]=VERY_SMALL;
-         speex_rand_vec(innov_gain, st->exc, st->frameSize);
+            st->exc[i]=speex_rand(innov_gain, &st->seed);
       }
 
 
       st->first=1;
 
-      /* Final signal synthesis from excitation */
-      iir_mem2(st->exc, lpc, st->frame, st->frameSize, st->lpcSize, st->mem_sp);
-
       for (i=0;i<st->frameSize;i++)
-      {
-         spx_word32_t sig = PSHR32(st->frame[i],SIG_SHIFT);
-         if (sig>32767)
-            sig = 32767;
-         if (sig<-32767)
-            sig = -32767;
-         out[i]=sig;
-      }
+         out[i] = st->exc[i];
+      /* Final signal synthesis from excitation */
+      iir_mem16(out, lpc, out, st->frameSize, st->lpcSize, st->mem_sp, stack);
 
       st->count_lost=0;
       return 0;
    }
 
+   ALLOC(qlsp, st->lpcSize, spx_lsp_t);
+
    /* Unquantize LSPs */
-   SUBMODE(lsp_unquant)(st->qlsp, st->lpcSize, bits);
+   SUBMODE(lsp_unquant)(qlsp, st->lpcSize, bits);
 
    /*Damp memory if a frame was lost and the LSP changed too much*/
    if (st->count_lost)
@@ -1421,13 +1382,13 @@
       spx_word16_t fact;
       spx_word32_t lsp_dist=0;
       for (i=0;i<st->lpcSize;i++)
-         lsp_dist = ADD32(lsp_dist, EXTEND32(ABS(st->old_qlsp[i] - st->qlsp[i])));
+         lsp_dist = ADD32(lsp_dist, EXTEND32(ABS(st->old_qlsp[i] - qlsp[i])));
 #ifdef FIXED_POINT
       fact = SHR16(19661,SHR32(lsp_dist,LSP_SHIFT+2));      
 #else
       fact = .6*exp(-.2*lsp_dist);
 #endif
-      for (i=0;i<2*st->lpcSize;i++)
+      for (i=0;i<st->lpcSize;i++)
          st->mem_sp[i] = MULT16_32_Q15(fact,st->mem_sp[i]);
    }
 
@@ -1436,7 +1397,7 @@
    if (st->first || st->count_lost)
    {
       for (i=0;i<st->lpcSize;i++)
-         st->old_qlsp[i] = st->qlsp[i];
+         st->old_qlsp[i] = qlsp[i];
    }
 
 #ifdef EPIC_48K
@@ -1483,9 +1444,9 @@
    }
 #endif
 
-   ALLOC(awk1, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk2, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk3, st->lpcSize+1, spx_coef_t);
+   ALLOC(ak, st->lpcSize, spx_coef_t);
+   ALLOC(innov, st->subframeSize, spx_sig_t);
+   ALLOC(exc32, st->subframeSize, spx_word32_t);
 
    if (st->submodeID==1)
    {
@@ -1504,7 +1465,9 @@
    for (sub=0;sub<st->nbSubframes;sub++)
    {
       int offset;
-      spx_sig_t *sp, *exc;
+      spx_word16_t *exc;
+      spx_word16_t *sp;
+      spx_sig_t *innov_save = NULL;
       spx_word16_t tmp;
 
 #ifdef EPIC_48K
@@ -1519,40 +1482,13 @@
 
       /* Offset relative to start of frame */
       offset = st->subframeSize*sub;
-      /* Original signal */
-      sp=st->frame+offset;
       /* Excitation */
       exc=st->exc+offset;
-      /* Excitation after post-filter*/
+      /* Output signal */
+      sp=out+offset;
+      if (st->innov_save)
+         innov_save = st->innov_save+offset;
 
-      /* LSP interpolation (quantized and unquantized) */
-      lsp_interpolate(st->old_qlsp, st->qlsp, st->interp_qlsp, st->lpcSize, sub, st->nbSubframes);
-
-      /* Make sure the LSP's are stable */
-      lsp_enforce_margin(st->interp_qlsp, st->lpcSize, LSP_MARGIN);
-
-
-      /* Compute interpolated LPCs (unquantized) */
-      lsp_to_lpc(st->interp_qlsp, st->interp_qlpc, st->lpcSize, stack);
-
-      /* Compute enhanced synthesis filter */
-      if (st->lpc_enh_enabled)
-      {
-         bw_lpc(SUBMODE(lpc_enh_k1), st->interp_qlpc, awk1, st->lpcSize);
-         bw_lpc(SUBMODE(lpc_enh_k2), st->interp_qlpc, awk2, st->lpcSize);
-         bw_lpc(SUBMODE(lpc_enh_k3), st->interp_qlpc, awk3, st->lpcSize);
-      }
-
-      /* Compute analysis filter at w=pi */
-      {
-         spx_word32_t pi_g=LPC_SCALING;
-         for (i=0;i<st->lpcSize;i+=2)
-         {
-            /*pi_g += -st->interp_qlpc[i] +  st->interp_qlpc[i+1];*/
-            pi_g = ADD32(pi_g, SUB32(st->interp_qlpc[i+1],st->interp_qlpc[i]));
-         }
-         st->pi_gain[sub] = pi_g;
-      }
 
       /* Reset excitation */
       for (i=0;i<st->subframeSize;i++)
@@ -1595,13 +1531,13 @@
 #ifdef EPIC_48K
          if (st->lbr_48k)
          {
-             SUBMODE(ltp_unquant)(exc, pit_min, pit_max, ol_pitch_coef, SUBMODE(ltp_params), 
+             SUBMODE(ltp_unquant)(exc, exc32, pit_min, pit_max, ol_pitch_coef, SUBMODE(ltp_params), 
                                   st->subframeSize, &pitch, &pitch_gain[0], bits, stack, 
                                   st->count_lost, offset, st->last_pitch_gain, ol_pitch_id);
          } else {
 #endif
 
-             SUBMODE(ltp_unquant)(exc, pit_min, pit_max, ol_pitch_coef, SUBMODE(ltp_params), 
+             SUBMODE(ltp_unquant)(exc, exc32, pit_min, pit_max, ol_pitch_coef, SUBMODE(ltp_params), 
                                   st->subframeSize, &pitch, &pitch_gain[0], bits, stack, 
                                   st->count_lost, offset, st->last_pitch_gain, 0);
 
@@ -1609,25 +1545,16 @@
          }
 #endif
 
-         
-         /* If we had lost frames, check energy of last received frame */
-         if (st->count_lost && ol_gain < st->last_ol_gain)
-         {
-            /*float fact = (float)ol_gain/(st->last_ol_gain+1);
-            for (i=0;i<st->subframeSize;i++)
-            exc[i]*=fact;*/
-            spx_word16_t fact = DIV32_16(SHL32(EXTEND32(ol_gain),15),st->last_ol_gain+1);
-            for (i=0;i<st->subframeSize;i++)
-               exc[i] = MULT16_32_Q15(fact, exc[i]);
-         }
-
          tmp = gain_3tap_to_1tap(pitch_gain);
 
          pitch_average += tmp;
-         if (tmp>best_pitch_gain)
+         if ((tmp>best_pitch_gain&&ABS(2*best_pitch-pitch)>=3&&ABS(3*best_pitch-pitch)>=4&&ABS(4*best_pitch-pitch)>=5) 
+              || (tmp>MULT16_16_Q15(QCONST16(.6,15),best_pitch_gain)&&(ABS(best_pitch-2*pitch)<3||ABS(best_pitch-3*pitch)<4||ABS(best_pitch-4*pitch)<5)) 
+              || (MULT16_16_Q15(QCONST16(.67,15),tmp)>best_pitch_gain&&(ABS(2*best_pitch-pitch)<3||ABS(3*best_pitch-pitch)<4||ABS(4*best_pitch-pitch)<5)) )
          {
             best_pitch = pitch;
-	    best_pitch_gain = tmp;
+            if (tmp > best_pitch_gain)
+               best_pitch_gain = tmp;
          }
       } else {
          speex_error("No pitch prediction, what's wrong");
@@ -1637,9 +1564,7 @@
       {
          int q_energy;
          spx_word32_t ener;
-         spx_sig_t *innov;
          
-         innov = st->innov+sub*st->subframeSize;
          for (i=0;i<st->subframeSize;i++)
             innov[i]=0;
 
@@ -1681,7 +1606,7 @@
             while (st->voc_offset<st->subframeSize)
             {
                if (st->voc_offset>=0)
-                  exc[st->voc_offset]=SIG_SCALING*sqrt(1.0*ol_pitch);
+                  exc[st->voc_offset]=sqrt(1.0*ol_pitch);
                st->voc_offset+=ol_pitch;
             }
             st->voc_offset -= st->subframeSize;
@@ -1693,8 +1618,9 @@
                g=1;
             for (i=0;i<st->subframeSize;i++)
             {
-               float exci=exc[i];
-               exc[i]=.8*g*exc[i]*ol_gain/SIG_SCALING + .6*g*st->voc_m1*ol_gain/SIG_SCALING + .5*g*innov[i] - .5*g*st->voc_m2 + (1-g)*innov[i];
+               spx_word16_t exci=exc[i];
+               /* FIXME: clean up the innov[i]/SIG_SCALING */
+               exc[i]=.8*g*exc[i]*PSHR32(ol_gain,SIG_SHIFT) + .6*g*st->voc_m1*PSHR32(ol_gain,SIG_SHIFT) + (1-.5*g)*PSHR32(innov[i],SIG_SHIFT) - .5*g*PSHR32(st->voc_m2,SIG_SHIFT);
                st->voc_m1 = exci;
                st->voc_m2=innov[i];
                st->voc_mean = .95*st->voc_mean + .05*exc[i];
@@ -1702,9 +1628,14 @@
             }
          } else {
             for (i=0;i<st->subframeSize;i++)
-               exc[i]=ADD32(exc[i],innov[i]);
+               exc[i]=PSHR32(ADD32(SHL32(exc32[i],1),innov[i]),SIG_SHIFT);
             /*print_vec(exc, 40, "innov");*/
          }
+         if (innov_save)
+         {
+            for (i=0;i<st->subframeSize;i++)
+               innov_save[i] = innov[i];
+         }
          /* Decode second codebook (only for some modes) */
          if (SUBMODE(double_codebook))
          {
@@ -1714,68 +1645,93 @@
             for (i=0;i<st->subframeSize;i++)
                innov2[i]=0;
             SUBMODE(innovation_unquant)(innov2, SUBMODE(innovation_params), st->subframeSize, bits, stack);
-            signal_mul(innov2, innov2, (spx_word32_t) (ener*(1/2.2)), st->subframeSize);
+            signal_mul(innov2, innov2, MULT16_32_Q15(QCONST16(0.454545,15),ener), st->subframeSize);
             for (i=0;i<st->subframeSize;i++)
-               exc[i] = ADD32(exc[i],innov2[i]);
+               exc[i] = ADD16(exc[i],PSHR32(innov2[i],SIG_SHIFT));
+            if (innov_save)
+            {
+               for (i=0;i<st->subframeSize;i++)
+                  innov_save[i] = ADD32(innov_save[i],innov2[i]);
+            }
             stack = tmp_stack;
          }
-
-      }
-
-      /* If the last packet was lost, re-scale the excitation to obtain the same energy as encoded in ol_gain */
-      if (st->count_lost) 
-      {
-         spx_word16_t exc_ener;
-         spx_word32_t gain32;
-         spx_word16_t gain;
-         exc_ener = compute_rms (exc, st->subframeSize);
-         gain32 = DIV32(ol_gain, ADD16(exc_ener,1));
-#ifdef FIXED_POINT
-         if (gain32 > 32768)
-            gain32 = 32768;
-         gain = EXTRACT16(gain32);
-#else
-         if (gain32 > 2)
-            gain32=2;
-         gain = gain32;
-#endif
-         for (i=0;i<st->subframeSize;i++)
-            exc[i] = MULT16_32_Q14(gain, exc[i]);
-      }
-
-      for (i=0;i<st->subframeSize;i++)
-         sp[i]=exc[i];
-
-      /* Signal synthesis */
-      if (st->lpc_enh_enabled && SUBMODE(comb_gain)>0)
-         comb_filter(exc, sp, st->interp_qlpc, st->lpcSize, st->subframeSize,
-                              pitch, pitch_gain, SUBMODE(comb_gain), st->comb_mem);
-
-      if (st->lpc_enh_enabled)
-      {
-         /* Use enhanced LPC filter */
-         filter_mem2(sp, awk2, awk1, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp+st->lpcSize);
-         filter_mem2(sp, awk3, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
-      } else {
-         /* Use regular filter */
-         for (i=0;i<st->lpcSize;i++)
-            st->mem_sp[st->lpcSize+i] = 0;
-         iir_mem2(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
       }
    }
    
-   /*Copy output signal*/   
-   for (i=0;i<st->frameSize;i++)
+   ALLOC(interp_qlsp, st->lpcSize, spx_lsp_t);
+
+   if (st->lpc_enh_enabled && SUBMODE(comb_gain)>0 && !st->count_lost)
    {
-      spx_word32_t sig = PSHR32(st->frame[i],SIG_SHIFT);
-      if (sig>32767)
-         sig = 32767;
-      if (sig<-32767)
-         sig = -32767;
-     out[i]=sig;
+      multicomb(st->exc-st->subframeSize, out, st->interp_qlpc, st->lpcSize, 2*st->subframeSize, best_pitch, 40, SUBMODE(comb_gain), stack);
+      multicomb(st->exc+st->subframeSize, out+2*st->subframeSize, st->interp_qlpc, st->lpcSize, 2*st->subframeSize, best_pitch, 40, SUBMODE(comb_gain), stack);
+   } else {
+      for (i=0;i<st->frameSize;i++)
+         out[i]=st->exc[i-st->subframeSize];
+   }
+   
+   /* If the last packet was lost, re-scale the excitation to obtain the same energy as encoded in ol_gain */
+   if (st->count_lost) 
+   {
+      spx_word16_t exc_ener;
+      spx_word32_t gain32;
+      spx_word16_t gain;
+      exc_ener = compute_rms16 (st->exc, st->frameSize);
+      gain32 = PDIV32(ol_gain, ADD16(exc_ener,1));
+#ifdef FIXED_POINT
+      if (gain32 > 32768)
+         gain32 = 32768;
+      gain = EXTRACT16(gain32);
+#else
+      if (gain32 > 2)
+         gain32=2;
+      gain = gain32;
+#endif
+      for (i=0;i<st->frameSize;i++)
+      {
+         st->exc[i] = MULT16_16_Q14(gain, st->exc[i]);
+         out[i]=st->exc[i-st->subframeSize];
+      }
+   }
+
+   /*Loop on subframes */
+   for (sub=0;sub<st->nbSubframes;sub++)
+   {
+      int offset;
+      spx_word16_t *sp;
+      spx_word16_t *exc;
+      /* Offset relative to start of frame */
+      offset = st->subframeSize*sub;
+      /* Output signal */
+      sp=out+offset;
+      /* Excitation */
+      exc=st->exc+offset;
+
+      /* LSP interpolation (quantized LSPs only) */
+      lsp_interpolate(st->old_qlsp, qlsp, interp_qlsp, st->lpcSize, sub, st->nbSubframes);
+
+      /* Make sure the LSP's are stable */
+      lsp_enforce_margin(interp_qlsp, st->lpcSize, LSP_MARGIN);
+
+      /* Compute interpolated LPCs (from the quantized LSPs) */
+      lsp_to_lpc(interp_qlsp, ak, st->lpcSize, stack);
+
+      /* Compute analysis filter at w=pi */
+      {
+         spx_word32_t pi_g=LPC_SCALING;
+         for (i=0;i<st->lpcSize;i+=2)
+         {
+            /*pi_g += -st->interp_qlpc[i] +  st->interp_qlpc[i+1];*/
+            pi_g = ADD32(pi_g, SUB32(EXTEND32(st->interp_qlpc[i+1]),EXTEND32(st->interp_qlpc[i])));
+         }
+         st->pi_gain[sub] = pi_g;
+      }
+      
+      iir_mem16(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
+                st->mem_sp, stack);
+      
+      for (i=0;i<st->lpcSize;i++)
+         st->interp_qlpc[i] = ak[i];
+
    }
 
    /*for (i=0;i<st->frameSize;i++)
@@ -1783,7 +1739,7 @@
 
    /* Store the LSPs for interpolation in the next frame */
    for (i=0;i<st->lpcSize;i++)
-      st->old_qlsp[i] = st->qlsp[i];
+      st->old_qlsp[i] = qlsp[i];
 
    /* The next frame will not be the first (Duh!) */
    st->first = 0;
@@ -1839,12 +1795,14 @@
       (*(int*)ptr) = st->dtx_enabled;
       break;
    case SPEEX_SET_ABR:
-      st->abr_enabled = (*(int*)ptr);
-      st->vbr_enabled = 1;
+      st->abr_enabled = (*(spx_int32_t*)ptr);
+      st->vbr_enabled = st->abr_enabled!=0;
+      if (st->vbr_enabled) 
       {
-         int i=10, rate, target;
+         int i=10;
+         spx_int32_t rate, target;
          float vbr_qual;
-         target = (*(int*)ptr);
+         target = (*(spx_int32_t*)ptr);
          while (i>=0)
          {
             speex_encoder_ctl(st, SPEEX_SET_QUALITY, &i);
@@ -1864,7 +1822,7 @@
       
       break;
    case SPEEX_GET_ABR:
-      (*(int*)ptr) = st->abr_enabled;
+      (*(spx_int32_t*)ptr) = st->abr_enabled;
       break;
    case SPEEX_SET_VBR_QUALITY:
       st->vbr_quality = (*(float*)ptr);
@@ -1888,12 +1846,13 @@
          st->complexity=0;
       break;
    case SPEEX_GET_COMPLEXITY:
-      (*(int*)ptr) = st->complexity;
+      (*(spx_int32_t*)ptr) = st->complexity;
       break;
    case SPEEX_SET_BITRATE:
       {
-         int i=10, rate, target;
-         target = (*(int*)ptr);
+         int i=10;
+         spx_int32_t rate, target;
+         target = (*(spx_int32_t*)ptr);
          while (i>=0)
          {
             speex_encoder_ctl(st, SPEEX_SET_QUALITY, &i);
@@ -1906,15 +1865,15 @@
       break;
    case SPEEX_GET_BITRATE:
       if (st->submodes[st->submodeID])
-         (*(int*)ptr) = st->sampling_rate*SUBMODE(bits_per_frame)/st->frameSize;
+         (*(spx_int32_t*)ptr) = st->sampling_rate*SUBMODE(bits_per_frame)/st->frameSize;
       else
-         (*(int*)ptr) = st->sampling_rate*(NB_SUBMODE_BITS+1)/st->frameSize;
+         (*(spx_int32_t*)ptr) = st->sampling_rate*(NB_SUBMODE_BITS+1)/st->frameSize;
       break;
    case SPEEX_SET_SAMPLING_RATE:
-      st->sampling_rate = (*(int*)ptr);
+      st->sampling_rate = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_SAMPLING_RATE:
-      (*(int*)ptr)=st->sampling_rate;
+      (*(spx_int32_t*)ptr)=st->sampling_rate;
       break;
    case SPEEX_RESET_STATE:
       {
@@ -1922,13 +1881,13 @@
          st->bounded_pitch = 1;
          st->first = 1;
          for (i=0;i<st->lpcSize;i++)
-            st->lsp[i]=(M_PI*((float)(i+1)))/(st->lpcSize+1);
+            st->old_lsp[i]=(M_PI*((float)(i+1)))/(st->lpcSize+1);
          for (i=0;i<st->lpcSize;i++)
             st->mem_sw[i]=st->mem_sw_whole[i]=st->mem_sp[i]=st->mem_exc[i]=0;
          for (i=0;i<st->frameSize+st->max_pitch+1;i++)
             st->excBuf[i]=st->swBuf[i]=0;
-         for (i=0;i<st->windowSize;i++)
-            st->inBuf[i]=0;
+         for (i=0;i<st->windowSize-st->frameSize;i++)
+            st->winBuf[i]=0;
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
@@ -1948,6 +1907,15 @@
    case SPEEX_GET_PLC_TUNING:
       (*(int*)ptr)=(st->plc_tuning);
       break;
+   case SPEEX_SET_VBR_MAX_BITRATE:
+      st->vbr_max = (*(spx_int32_t*)ptr);
+      break;
+   case SPEEX_GET_VBR_MAX_BITRATE:
+      (*(spx_int32_t*)ptr) = st->vbr_max;
+      break;
+
+
+   /* This is all internal stuff past this point */
    case SPEEX_GET_PI_GAIN:
       {
          int i;
@@ -1959,22 +1927,17 @@
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
+         spx_word16_t *e = (spx_word16_t*)ptr;
          for (i=0;i<st->frameSize;i++)
             e[i]=st->exc[i];
       }
       break;
-   case SPEEX_GET_INNOV:
-      {
-         int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->frameSize;i++)
-            e[i]=st->innov[i];
-      }
-      break;
    case SPEEX_GET_RELATIVE_QUALITY:
       (*(float*)ptr)=st->relative_quality;
       break;
+   case SPEEX_SET_INNOVATION_SAVE:
+      st->innov_save = ptr;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
@@ -2007,15 +1970,15 @@
       break;
    case SPEEX_GET_BITRATE:
       if (st->submodes[st->submodeID])
-         (*(int*)ptr) = st->sampling_rate*SUBMODE(bits_per_frame)/st->frameSize;
+         (*(spx_int32_t*)ptr) = st->sampling_rate*SUBMODE(bits_per_frame)/st->frameSize;
       else
-         (*(int*)ptr) = st->sampling_rate*(NB_SUBMODE_BITS+1)/st->frameSize;
+         (*(spx_int32_t*)ptr) = st->sampling_rate*(NB_SUBMODE_BITS+1)/st->frameSize;
       break;
    case SPEEX_SET_SAMPLING_RATE:
-      st->sampling_rate = (*(int*)ptr);
+      st->sampling_rate = (*(spx_int32_t*)ptr);
       break;
    case SPEEX_GET_SAMPLING_RATE:
-      (*(int*)ptr)=st->sampling_rate;
+      (*(spx_int32_t*)ptr)=st->sampling_rate;
       break;
    case SPEEX_SET_HANDLER:
       {
@@ -2036,12 +1999,10 @@
    case SPEEX_RESET_STATE:
       {
          int i;
-         for (i=0;i<2*st->lpcSize;i++)
+         for (i=0;i<st->lpcSize;i++)
             st->mem_sp[i]=0;
          for (i=0;i<st->frameSize + st->max_pitch + 1;i++)
             st->excBuf[i]=0;
-         for (i=0;i<st->frameSize;i++)
-            st->inBuf[i] = 0;
       }
       break;
    case SPEEX_SET_SUBMODE_ENCODING:
@@ -2050,6 +2011,9 @@
    case SPEEX_GET_SUBMODE_ENCODING:
       (*(int*)ptr) = st->encode_submode;
       break;
+   case SPEEX_GET_LOOKAHEAD:
+      (*(int*)ptr)=st->subframeSize;
+      break;
    case SPEEX_GET_PI_GAIN:
       {
          int i;
@@ -2061,22 +2025,17 @@
    case SPEEX_GET_EXC:
       {
          int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
+         spx_word16_t *e = (spx_word16_t*)ptr;
          for (i=0;i<st->frameSize;i++)
             e[i]=st->exc[i];
       }
       break;
-   case SPEEX_GET_INNOV:
-      {
-         int i;
-         spx_sig_t *e = (spx_sig_t*)ptr;
-         for (i=0;i<st->frameSize;i++)
-            e[i]=st->innov[i];
-      }
-      break;
    case SPEEX_GET_DTX_STATUS:
       *((int*)ptr) = st->dtx_enabled;
       break;
+   case SPEEX_SET_INNOVATION_SAVE:
+      st->innov_save = ptr;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
diff --git a/pjmedia/src/pjmedia-codec/speex/nb_celp.h b/pjmedia/src/pjmedia-codec/speex/nb_celp.h
index c9fb2b3..92028cb 100644
--- a/pjmedia/src/pjmedia-codec/speex/nb_celp.h
+++ b/pjmedia/src/pjmedia-codec/speex/nb_celp.h
@@ -48,20 +48,20 @@
 
 /**Structure representing the full state of the narrowband encoder*/
 typedef struct EncState {
-   const SpeexMode *mode;       /**< Mode corresponding to the state */
-   int    first;          /**< Is this the first frame? */
-   int    frameSize;      /**< Size of frames */
-   int    subframeSize;   /**< Size of sub-frames */
-   int    nbSubframes;    /**< Number of sub-frames */
-   int    windowSize;     /**< Analysis (LPC) window length */
-   int    lpcSize;        /**< LPC order */
-   int    min_pitch;      /**< Minimum pitch value allowed */
-   int    max_pitch;      /**< Maximum pitch value allowed */
+   const SpeexMode *mode;        /**< Mode corresponding to the state */
+   int    first;                 /**< Is this the first frame? */
+   int    frameSize;             /**< Size of frames */
+   int    subframeSize;          /**< Size of sub-frames */
+   int    nbSubframes;           /**< Number of sub-frames */
+   int    windowSize;            /**< Analysis (LPC) window length */
+   int    lpcSize;               /**< LPC order */
+   int    min_pitch;             /**< Minimum pitch value allowed */
+   int    max_pitch;             /**< Maximum pitch value allowed */
 
-   int    safe_pitch;     /**< Don't use too large values for pitch (in case we lose a packet) */
-   int    bounded_pitch;  /**< Next frame should not rely on previous frames for pitch */
-   int    ol_pitch;       /**< Open-loop pitch */
-   int    ol_voiced;      /**< Open-loop voiced/non-voiced decision */
+   spx_word32_t cumul_gain;      /**< Product of previously used pitch gains (Q10) */
+   int    bounded_pitch;         /**< Next frame should not rely on previous frames for pitch */
+   int    ol_pitch;              /**< Open-loop pitch */
+   int    ol_voiced;             /**< Open-loop voiced/non-voiced decision */
    int   *pitch;
 
 #ifdef EPIC_48K
@@ -70,111 +70,100 @@
 
 #ifdef VORBIS_PSYCHO
    VorbisPsy *psy;
+   float *psy_window;
    float *curve;
    float *old_curve;
 #endif
 
    spx_word16_t  gamma1;         /**< Perceptual filter: A(z/gamma1) */
    spx_word16_t  gamma2;         /**< Perceptual filter: A(z/gamma2) */
-   float  lag_factor;     /**< Lag windowing Gaussian width */
+   float  lag_factor;            /**< Lag windowing Gaussian width */
    spx_word16_t  lpc_floor;      /**< Noise floor multiplier for A[0] in LPC analysis*/
-   char  *stack;          /**< Pseudo-stack allocation for temporary memory */
-   spx_sig_t *inBuf;          /**< Input buffer (original signal) */
-   spx_sig_t *frame;          /**< Start of original frame */
-   spx_sig_t *excBuf;         /**< Excitation buffer */
-   spx_sig_t *exc;            /**< Start of excitation frame */
-   spx_sig_t *swBuf;          /**< Weighted signal buffer */
-   spx_sig_t *sw;             /**< Start of weighted signal frame */
-   spx_sig_t *innov;          /**< Innovation for the frame */
-   spx_word16_t *window;         /**< Temporary (Hanning) window */
-   spx_word16_t *autocorr;       /**< auto-correlation */
+   char  *stack;                 /**< Pseudo-stack allocation for temporary memory */
+   spx_word16_t *winBuf;         /**< Input buffer (original signal) */
+   spx_word16_t *excBuf;         /**< Excitation buffer */
+   spx_word16_t *exc;            /**< Start of excitation frame */
+   spx_word16_t *swBuf;          /**< Weighted signal buffer */
+   spx_word16_t *sw;             /**< Start of weighted signal frame */
+   const spx_word16_t *window;   /**< Temporary (Hanning) window */
    spx_word16_t *lagWindow;      /**< Window applied to auto-correlation */
-   spx_coef_t *lpc;            /**< LPCs for current frame */
-   spx_lsp_t *lsp;            /**< LSPs for current frame */
-   spx_lsp_t *qlsp;           /**< Quantized LSPs for current frame */
-   spx_lsp_t *old_lsp;        /**< LSPs for previous frame */
-   spx_lsp_t *old_qlsp;       /**< Quantized LSPs for previous frame */
-   spx_lsp_t *interp_lsp;     /**< Interpolated LSPs */
-   spx_lsp_t *interp_qlsp;    /**< Interpolated quantized LSPs */
-   spx_coef_t *interp_lpc;     /**< Interpolated LPCs */
-   spx_coef_t *interp_qlpc;    /**< Interpolated quantized LPCs */
-   spx_coef_t *bw_lpc1;        /**< LPCs after bandwidth expansion by gamma1 for perceptual weighting*/
-   spx_coef_t *bw_lpc2;        /**< LPCs after bandwidth expansion by gamma2 for perceptual weighting*/
-   spx_mem_t *mem_sp;         /**< Filter memory for signal synthesis */
-   spx_mem_t *mem_sw;         /**< Filter memory for perceptually-weighted signal */
-   spx_mem_t *mem_sw_whole;   /**< Filter memory for perceptually-weighted signal (whole frame)*/
-   spx_mem_t *mem_exc;        /**< Filter memory for excitation (whole frame) */
+   spx_lsp_t *old_lsp;           /**< LSPs for previous frame */
+   spx_lsp_t *old_qlsp;          /**< Quantized LSPs for previous frame */
+   spx_mem_t *mem_sp;            /**< Filter memory for signal synthesis */
+   spx_mem_t *mem_sw;            /**< Filter memory for perceptually-weighted signal */
+   spx_mem_t *mem_sw_whole;      /**< Filter memory for perceptually-weighted signal (whole frame)*/
+   spx_mem_t *mem_exc;           /**< Filter memory for excitation (whole frame) */
+   spx_mem_t *mem_exc2;          /**< Filter memory for excitation (whole frame) */
    spx_word32_t *pi_gain;        /**< Gain of LPC filter at theta=pi (fe/2) */
-
-   VBRState *vbr;         /**< State of the VBR data */
-   float  vbr_quality;    /**< Quality setting for VBR encoding */
-   float  relative_quality; /**< Relative quality that will be needed by VBR */
-   int    vbr_enabled;    /**< 1 for enabling VBR, 0 otherwise */
-   int    vad_enabled;    /**< 1 for enabling VAD, 0 otherwise */
-   int    dtx_enabled;    /**< 1 for enabling DTX, 0 otherwise */
-   int    dtx_count;      /**< Number of consecutive DTX frames */
-   int    abr_enabled;    /**< ABR setting (in bps), 0 if off */
+   spx_sig_t *innov_save;        /**< If non-NULL, innovation is copied here */
+         
+   VBRState *vbr;                /**< State of the VBR data */
+   float  vbr_quality;           /**< Quality setting for VBR encoding */
+   float  relative_quality;      /**< Relative quality that will be needed by VBR */
+   int    vbr_enabled;           /**< 1 for enabling VBR, 0 otherwise */
+   spx_int32_t vbr_max;          /**< Max bit-rate allowed in VBR mode */
+   int    vad_enabled;           /**< 1 for enabling VAD, 0 otherwise */
+   int    dtx_enabled;           /**< 1 for enabling DTX, 0 otherwise */
+   int    dtx_count;             /**< Number of consecutive DTX frames */
+   spx_int32_t abr_enabled;      /**< ABR setting (in bps), 0 if off */
    float  abr_drift;
    float  abr_drift2;
    float  abr_count;
-   int    complexity;     /**< Complexity setting (0-10 from least complex to most complex) */
-   int    sampling_rate;
+   int    complexity;            /**< Complexity setting (0-10 from least complex to most complex) */
+   spx_int32_t sampling_rate;
    int    plc_tuning;
    int    encode_submode;
    const SpeexSubmode * const *submodes; /**< Sub-mode data */
-   int    submodeID;      /**< Activated sub-mode */
-   int    submodeSelect;  /**< Mode chosen by the user (may differ from submodeID if VAD is on) */
+   int    submodeID;             /**< Activated sub-mode */
+   int    submodeSelect;         /**< Mode chosen by the user (may differ from submodeID if VAD is on) */
 } EncState;
 
 /**Structure representing the full state of the narrowband decoder*/
 typedef struct DecState {
    const SpeexMode *mode;       /**< Mode corresponding to the state */
-   int    first;          /**< Is this the first frame? */
-   int    count_lost;     /**< Was the last frame lost? */
-   int    frameSize;      /**< Size of frames */
-   int    subframeSize;   /**< Size of sub-frames */
-   int    nbSubframes;    /**< Number of sub-frames */
-   int    lpcSize;        /**< LPC order */
-   int    min_pitch;      /**< Minimum pitch value allowed */
-   int    max_pitch;      /**< Maximum pitch value allowed */
-   int    sampling_rate;
+   int    first;                /**< Is this the first frame? */
+   int    count_lost;           /**< Was the last frame lost? */
+   int    frameSize;            /**< Size of frames */
+   int    subframeSize;         /**< Size of sub-frames */
+   int    nbSubframes;          /**< Number of sub-frames */
+   int    lpcSize;              /**< LPC order */
+   int    min_pitch;            /**< Minimum pitch value allowed */
+   int    max_pitch;            /**< Maximum pitch value allowed */
+   spx_int32_t sampling_rate;
 
 #ifdef EPIC_48K
    int    lbr_48k;
 #endif
 
-   spx_word16_t  last_ol_gain;   /**< Open-loop gain for previous frame */
+   spx_word16_t  last_ol_gain;  /**< Open-loop gain for previous frame */
 
-   char  *stack;          /**< Pseudo-stack allocation for temporary memory */
-   spx_sig_t *inBuf;          /**< Input buffer (original signal) */
-   spx_sig_t *frame;          /**< Start of original frame */
-   spx_sig_t *excBuf;         /**< Excitation buffer */
-   spx_sig_t *exc;            /**< Start of excitation frame */
-   spx_sig_t *innov;          /**< Innovation for the frame */
-   spx_lsp_t *qlsp;           /**< Quantized LSPs for current frame */
-   spx_lsp_t *old_qlsp;       /**< Quantized LSPs for previous frame */
-   spx_lsp_t *interp_qlsp;    /**< Interpolated quantized LSPs */
-   spx_coef_t *interp_qlpc;    /**< Interpolated quantized LPCs */
-   spx_mem_t *mem_sp;         /**< Filter memory for synthesis signal */
-   spx_word32_t *pi_gain;        /**< Gain of LPC filter at theta=pi (fe/2) */
-   int    last_pitch;     /**< Pitch of last correctly decoded frame */
+   char  *stack;                /**< Pseudo-stack allocation for temporary memory */
+   spx_word16_t *excBuf;        /**< Excitation buffer */
+   spx_word16_t *exc;           /**< Start of excitation frame */
+   spx_lsp_t *old_qlsp;         /**< Quantized LSPs for previous frame */
+   spx_coef_t *interp_qlpc;     /**< Interpolated quantized LPCs */
+   spx_mem_t *mem_sp;           /**< Filter memory for synthesis signal */
+   spx_word32_t *pi_gain;       /**< Gain of LPC filter at theta=pi (fe/2) */
+   spx_sig_t *innov_save;       /**< If non-NULL, innovation is copied here */
+   
+   /* This is used in packet loss concealment */
+   int    last_pitch;           /**< Pitch of last correctly decoded frame */
    spx_word16_t  last_pitch_gain; /**< Pitch gain of last correctly decoded frame */
-   spx_word16_t  pitch_gain_buf[3];  /**< Pitch gain of last decoded frames */
-   int    pitch_gain_buf_idx; /**< Tail of the buffer */
-   spx_int32_t seed;          /** Seed used for random number generation */
+   spx_word16_t  pitch_gain_buf[3]; /**< Pitch gain of last decoded frames */
+   int    pitch_gain_buf_idx;   /**< Tail of the buffer */
+   spx_int32_t seed;            /**< Seed used for random number generation */
    
    int    encode_submode;
    const SpeexSubmode * const *submodes; /**< Sub-mode data */
-   int    submodeID;      /**< Activated sub-mode */
-   int    lpc_enh_enabled; /**< 1 when LPC enhancer is on, 0 otherwise */
-   CombFilterMem *comb_mem;
+   int    submodeID;            /**< Activated sub-mode */
+   int    lpc_enh_enabled;      /**< 1 when LPC enhancer is on, 0 otherwise */
    SpeexCallback speex_callbacks[SPEEX_MAX_CALLBACKS];
 
    SpeexCallback user_callback;
 
    /*Vocoder data*/
-   float  voc_m1;
-   float  voc_m2;
+   spx_word16_t  voc_m1;
+   spx_word32_t  voc_m2;
    float  voc_mean;
    int    voc_offset;
 
diff --git a/pjmedia/src/pjmedia-codec/speex/pseudofloat.h b/pjmedia/src/pjmedia-codec/speex/pseudofloat.h
index 8642bf0..e85f60e 100644
--- a/pjmedia/src/pjmedia-codec/speex/pseudofloat.h
+++ b/pjmedia/src/pjmedia-codec/speex/pseudofloat.h
@@ -45,9 +45,9 @@
    spx_int16_t e;
 } spx_float_t;
 
-#define FLOAT_ZERO ((spx_float_t){0,0})
-#define FLOAT_ONE ((spx_float_t){16384,-14})
-#define FLOAT_HALF ((spx_float_t){16384,-15})
+static const spx_float_t FLOAT_ZERO = {0,0};
+static const spx_float_t FLOAT_ONE = {16384,-14};
+static const spx_float_t FLOAT_HALF = {16384,-15};
 
 #define MIN(a,b) ((a)<(b)?(a):(b))
 static inline spx_float_t PSEUDOFLOAT(spx_int32_t x)
@@ -60,7 +60,10 @@
       x = -x;
    }
    if (x==0)
-      return (spx_float_t) {0,0};
+   {
+      spx_float_t r = {0,0};
+      return r;
+   }
    while (x>32767)
    {
       x >>= 1;
@@ -74,9 +77,19 @@
       e--;
    }
    if (sign)
-      return (spx_float_t) {-x,e};
+   {
+      spx_float_t r;
+      r.m = -x;
+      r.e = e;
+      return r;
+   }
    else      
-      return (spx_float_t) {x,e};
+   {
+      spx_float_t r;
+      r.m = x;
+      r.e = e;
+      return r;
+   }
 }
 
 
@@ -87,7 +100,16 @@
       return b;
    else if (b.m==0)
       return a;
-   r = (a).e > (b).e ? (spx_float_t) {((a).m>>1) + ((b).m>>MIN(15,(a).e-(b).e+1)),(a).e+1} : (spx_float_t) {((b).m>>1) + ((a).m>>MIN(15,(b).e-(a).e+1)),(b).e+1};
+   if ((a).e > (b).e) 
+   {
+      r.m = ((a).m>>1) + ((b).m>>MIN(15,(a).e-(b).e+1));
+      r.e = (a).e+1;
+   }
+   else 
+   {
+      r.m = ((b).m>>1) + ((a).m>>MIN(15,(b).e-(a).e+1));
+      r.e = (b).e+1;
+   }
    if (r.m>0)
    {
       if (r.m<16384)
@@ -113,7 +135,16 @@
       return b;
    else if (b.m==0)
       return a;
-   r = (a).e > (b).e ? (spx_float_t) {((a).m>>1) - ((b).m>>MIN(15,(a).e-(b).e+1)),(a).e+1} : (spx_float_t) {((a).m>>MIN(15,(b).e-(a).e+1)) - ((b).m>>1) ,(b).e+1};
+   if ((a).e > (b).e)
+   {
+      r.m = ((a).m>>1) - ((b).m>>MIN(15,(a).e-(b).e+1));
+      r.e = (a).e+1;
+   }
+   else 
+   {
+      r.m = ((a).m>>MIN(15,(b).e-(a).e+1)) - ((b).m>>1);
+      r.e = (b).e+1;
+   }
    if (r.m>0)
    {
       if (r.m<16384)
@@ -152,7 +183,9 @@
 
 static inline spx_float_t FLOAT_MULT(spx_float_t a, spx_float_t b)
 {
-   spx_float_t r = (spx_float_t) {(spx_int16_t)((spx_int32_t)(a).m*(b).m>>15), (a).e+(b).e+15};
+   spx_float_t r;
+   r.m = (spx_int16_t)((spx_int32_t)(a).m*(b).m>>15);
+   r.e = (a).e+(b).e+15;
    if (r.m>0)
    {
       if (r.m<16384)
@@ -174,13 +207,16 @@
 
 static inline spx_float_t FLOAT_SHL(spx_float_t a, int b)
 {
-   return (spx_float_t) {a.m,a.e+b};
+   spx_float_t r;
+   r.m = a.m;
+   r.e = a.e+b;
+   return r;
 }
 
 static inline spx_int16_t FLOAT_EXTRACT16(spx_float_t a)
 {
    if (a.e<0)
-      return (a.m+(1<<(-a.e-1)))>>-a.e;
+      return EXTRACT16((EXTEND32(a.m)+(1<<(-a.e-1)))>>-a.e);
    else
       return a.m<<a.e;
 }
@@ -196,9 +232,12 @@
 static inline spx_float_t FLOAT_MUL32U(spx_word32_t a, spx_word32_t b)
 {
    int e=0;
+   spx_float_t r;
    /* FIXME: Handle the sign */
    if (a==0)
-      return (spx_float_t) {0,0};
+   {
+      return FLOAT_ZERO;
+   }
    while (a>32767)
    {
       a >>= 1;
@@ -219,35 +258,45 @@
       b <<= 1;
       e--;
    }
-   return (spx_float_t) {MULT16_16_Q15(a,b),e+15};
+   r.m = MULT16_16_Q15(a,b);
+   r.e = e+15;
+   return r;
 }
 
 static inline spx_float_t FLOAT_DIV32_FLOAT(spx_word32_t a, spx_float_t b)
 {
    int e=0;
+   spx_float_t r;
    /* FIXME: Handle the sign */
    if (a==0)
-      return (spx_float_t) {0,0};
-   while (a<SHL32(b.m,14))
+   {
+      return FLOAT_ZERO;
+   }
+   while (a<SHL32(EXTEND32(b.m),14))
    {
       a <<= 1;
       e--;
    }
-   while (a>=SHL32(b.m-1,15))
+   while (a>=SHL32(EXTEND32(b.m-1),15))
    {
       a >>= 1;
       e++;
    }
-   return (spx_float_t) {DIV32_16(a,b.m),e-b.e};
+   r.m = DIV32_16(a,b.m);
+   r.e = e-b.e;
+   return r;
 }
 
 
 static inline spx_float_t FLOAT_DIV32(spx_word32_t a, spx_word32_t b)
 {
    int e=0;
+   spx_float_t r;
    /* FIXME: Handle the sign */
    if (a==0)
-      return (spx_float_t) {0,0};
+   {
+      return FLOAT_ZERO;
+   }
    while (b>32767)
    {
       b >>= 1;
@@ -263,13 +312,16 @@
       a >>= 1;
       e++;
    }
-   return (spx_float_t) {DIV32_16(a,b),e};
+   r.m = DIV32_16(a,b);
+   r.e = e;
+   return r;
 }
 
 static inline spx_float_t FLOAT_DIVU(spx_float_t a, spx_float_t b)
 {
    int e=0;
    spx_int32_t num;
+   spx_float_t r;
    num = a.m;
    while (a.m >= b.m)
    {
@@ -277,7 +329,9 @@
       a.m >>= 1;
    }
    num = num << (15-e);
-   return (spx_float_t) {DIV32_16(num,b.m),a.e-b.e-15+e};
+   r.m = DIV32_16(num,b.m);
+   r.e = a.e-b.e-15+e;
+   return r;
 }
 
 #else
diff --git a/pjmedia/src/pjmedia-codec/speex/quant_lsp.c b/pjmedia/src/pjmedia-codec/speex/quant_lsp.c
index 7bd0b91..bfca587 100644
--- a/pjmedia/src/pjmedia-codec/speex/quant_lsp.c
+++ b/pjmedia/src/pjmedia-codec/speex/quant_lsp.c
@@ -40,9 +40,12 @@
 #define M_PI 3.14159265358979323846
 #endif
 
-
 #include "misc.h"
 
+#ifdef BFIN_ASM
+#include "quant_lsp_bfin.h"
+#endif
+
 #ifdef FIXED_POINT
 
 #define LSP_LINEAR(i) (SHL16(i+1,11))
@@ -90,12 +93,13 @@
 }
 
 /* Note: x is modified*/
+#ifndef OVERRIDE_LSP_QUANT
 static int lsp_quant(spx_word16_t *x, const signed char *cdbk, int nbVec, int nbDim)
 {
    int i,j;
    spx_word32_t dist;
    spx_word16_t tmp;
-   spx_word32_t best_dist=0;
+   spx_word32_t best_dist=VERY_LARGE32;
    int best_id=0;
    const signed char *ptr=cdbk;
    for (i=0;i<nbVec;i++)
@@ -105,8 +109,8 @@
       {
          tmp=SUB16(x[j],SHL16((spx_word16_t)*ptr++,5));
          dist=MAC16_16(dist,tmp,tmp);
-      }
-      if (dist<best_dist || i==0)
+      } 
+      if (dist<best_dist)
       {
          best_dist=dist;
          best_id=i;
@@ -118,14 +122,16 @@
     
    return best_id;
 }
+#endif
 
 /* Note: x is modified*/
+#ifndef OVERRIDE_LSP_WEIGHT_QUANT
 static int lsp_weight_quant(spx_word16_t *x, spx_word16_t *weight, const signed char *cdbk, int nbVec, int nbDim)
 {
    int i,j;
    spx_word32_t dist;
    spx_word16_t tmp;
-   spx_word32_t best_dist=0;
+   spx_word32_t best_dist=VERY_LARGE32;
    int best_id=0;
    const signed char *ptr=cdbk;
    for (i=0;i<nbVec;i++)
@@ -136,7 +142,7 @@
          tmp=SUB16(x[j],SHL16((spx_word16_t)*ptr++,5));
          dist=MAC16_32_Q15(dist,weight[j],MULT16_16(tmp,tmp));
       }
-      if (dist<best_dist || i==0)
+      if (dist<best_dist)
       {
          best_dist=dist;
          best_id=i;
@@ -147,7 +153,7 @@
       x[j] = SUB16(x[j],SHL16((spx_word16_t)cdbk[best_id*nbDim+j],5));
    return best_id;
 }
-
+#endif
 
 void lsp_quant_nb(spx_lsp_t *lsp, spx_lsp_t *qlsp, int order, SpeexBits *bits)
 {
diff --git a/pjmedia/src/pjmedia-codec/speex/sb_celp.c b/pjmedia/src/pjmedia-codec/speex/sb_celp.c
index 5adf44d..465a538 100644
--- a/pjmedia/src/pjmedia-codec/speex/sb_celp.c
+++ b/pjmedia/src/pjmedia-codec/speex/sb_celp.c
@@ -201,6 +201,8 @@
 };
 #endif
 
+extern const spx_word16_t lpc_window[];
+
 static void mix_and_saturate(spx_word32_t *x0, spx_word32_t *x1, spx_word16_t *out, int len)
 {
    int i;
@@ -244,7 +246,7 @@
    st->frame_size = mode->frameSize;
    st->subframeSize = mode->subframeSize;
    st->nbSubframes = mode->frameSize/mode->subframeSize;
-   st->windowSize = st->frame_size*3/2;
+   st->windowSize = st->frame_size+st->subframeSize;
    st->lpcSize=mode->lpcSize;
    st->bufSize=mode->bufSize;
 
@@ -277,18 +279,7 @@
 
    st->res=speex_alloc((st->frame_size)*sizeof(spx_sig_t));
    st->sw=speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   st->target=speex_alloc((st->frame_size)*sizeof(spx_sig_t));
-   /*Asymmetric "pseudo-Hamming" window*/
-   {
-      int part1, part2;
-      part1 = st->subframeSize*7/2;
-      part2 = st->subframeSize*5/2;
-      st->window = speex_alloc((st->windowSize)*sizeof(spx_word16_t));
-      for (i=0;i<part1;i++)
-         st->window[i]=(spx_word16_t)(SIG_SCALING*(.54-.46*cos(M_PI*i/part1)));
-      for (i=0;i<part2;i++)
-         st->window[part1+i]=(spx_word16_t)(SIG_SCALING*(.54+.46*cos(M_PI*i/part2)));
-   }
+   st->window= lpc_window;
 
    st->lagWindow = speex_alloc((st->lpcSize+1)*sizeof(spx_word16_t));
    for (i=0;i<st->lpcSize+1;i++)
@@ -307,13 +298,18 @@
    st->interp_lpc = speex_alloc(st->lpcSize*sizeof(spx_coef_t));
    st->interp_qlpc = speex_alloc(st->lpcSize*sizeof(spx_coef_t));
    st->pi_gain = speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
-
+   st->low_innov = speex_alloc((st->frame_size)*sizeof(spx_word32_t));
+   speex_encoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, st->low_innov);
+   st->innov_save = NULL;
+   
    st->mem_sp = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sp2 = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
    st->mem_sw = speex_alloc((st->lpcSize)*sizeof(spx_mem_t));
 
    st->vbr_quality = 8;
    st->vbr_enabled = 0;
+   st->vbr_max = 0;
+   st->vbr_max_high = 20000;  /* We just need a big value here */
    st->vad_enabled = 0;
    st->abr_enabled = 0;
    st->relative_quality=0;
@@ -350,8 +346,6 @@
    speex_free(st->excBuf);
    speex_free(st->res);
    speex_free(st->sw);
-   speex_free(st->target);
-   speex_free(st->window);
    speex_free(st->lagWindow);
 
    speex_free(st->autocorr);
@@ -384,10 +378,10 @@
    char *stack;
    VARDECL(spx_mem_t *mem);
    VARDECL(spx_sig_t *innov);
+   VARDECL(spx_word16_t *target);
    VARDECL(spx_word16_t *syn_resp);
    VARDECL(spx_word32_t *low_pi_gain);
-   VARDECL(spx_sig_t *low_exc);
-   VARDECL(spx_sig_t *low_innov);
+   VARDECL(spx_word16_t *low_exc);
    const SpeexSBMode *mode;
    int dtx;
    spx_word16_t *in = vin;
@@ -422,11 +416,9 @@
 
 
    ALLOC(low_pi_gain, st->nbSubframes, spx_word32_t);
-   ALLOC(low_exc, st->frame_size, spx_sig_t);
-   ALLOC(low_innov, st->frame_size, spx_sig_t);
+   ALLOC(low_exc, st->frame_size, spx_word16_t);
    speex_encoder_ctl(st->st_low, SPEEX_GET_PI_GAIN, low_pi_gain);
    speex_encoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc);
-   speex_encoder_ctl(st->st_low, SPEEX_GET_INNOV, low_innov);
    
    speex_encoder_ctl(st->st_low, SPEEX_GET_LOW_MODE, &dtx);
 
@@ -455,15 +447,15 @@
    _spx_lpc(st->lpc, st->autocorr, st->lpcSize);
 
    /* LPC to LSPs (x-domain) transform */
-   roots=lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 15, LSP_DELTA1, stack);
+   roots=lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 10, LSP_DELTA1, stack);
    if (roots!=st->lpcSize)
    {
-      roots = lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 11, LSP_DELTA2, stack);
+      roots = lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 10, LSP_DELTA2, stack);
       if (roots!=st->lpcSize) {
          /*If we can't find all LSP's, do some damage control and use a flat filter*/
          for (i=0;i<st->lpcSize;i++)
          {
-            st->lsp[i]=M_PI*((float)(i+1))/(st->lpcSize+1);
+            st->lsp[i]=LSP_SCALING*M_PI*((float)(i+1))/(st->lpcSize+1);
          }
       }
    }
@@ -521,7 +513,7 @@
             else
                thresh = (st->vbr_quality-v1)   * mode->vbr_thresh[modeid][v1+1] + 
                         (1+v1-st->vbr_quality) * mode->vbr_thresh[modeid][v1];
-            if (st->relative_quality >= thresh)
+            if (st->relative_quality >= thresh && st->sampling_rate*st->submodes[modeid]->bits_per_frame/st->full_frame_size <= st->vbr_max_high)
                break;
             modeid--;
          }
@@ -601,10 +593,11 @@
    ALLOC(mem, st->lpcSize, spx_mem_t);
    ALLOC(syn_resp, st->subframeSize, spx_word16_t);
    ALLOC(innov, st->subframeSize, spx_sig_t);
+   ALLOC(target, st->subframeSize, spx_word16_t);
 
    for (sub=0;sub<st->nbSubframes;sub++)
    {
-      spx_sig_t *exc, *sp, *res, *target, *sw;
+      spx_sig_t *exc, *sp, *res, *sw, *innov_save=NULL;
       spx_word16_t filter_ratio;
       int offset;
       spx_word32_t rl, rh;
@@ -614,8 +607,14 @@
       sp=st->high+offset;
       exc=st->exc+offset;
       res=st->res+offset;
-      target=st->target+offset;
       sw=st->sw+offset;
+      /* Pointer for saving innovation */
+      if (st->innov_save)
+      {
+         innov_save = st->innov_save+2*offset;
+         for (i=0;i<2*st->subframeSize;i++)
+            innov_save[i]=0;
+      }
       
       /* LSP interpolation (quantized and unquantized) */
       lsp_interpolate(st->old_lsp, st->lsp, st->interp_lsp, st->lpcSize, sub, st->nbSubframes);
@@ -642,7 +641,7 @@
       
       rl = low_pi_gain[sub];
 #ifdef FIXED_POINT
-      filter_ratio=DIV32_16(SHL(rl+82,2),SHR(82+rh,5));
+      filter_ratio=PDIV32_16(SHL(rl+82,2),SHR(82+rh,5));
 #else
       filter_ratio=(rl+.01)/(rh+.01);
 #endif
@@ -656,10 +655,10 @@
       if (!SUBMODE(innovation_quant)) {/* 1 for spectral folding excitation, 0 for stochastic */
          float g;
          spx_word16_t el;
-         el = compute_rms(low_innov+offset, st->subframeSize);
+         el = compute_rms(st->low_innov+offset, st->subframeSize);
 
          /* Gain to use if we want to use the low-band excitation for high-band */
-         g=eh/(.01+el);
+         g=eh/(1.+el);
          
 #if 0
          {
@@ -669,7 +668,7 @@
             ALLOC(tmp_sig, st->subframeSize, spx_sig_t);
             for (i=0;i<st->lpcSize;i++)
                mem[i]=st->mem_sp[i];
-            iir_mem2(low_innov+offset, st->interp_qlpc, tmp_sig, st->subframeSize, st->lpcSize, mem);
+            iir_mem2(st->low_innov+offset, st->interp_qlpc, tmp_sig, st->subframeSize, st->lpcSize, mem);
             g2 = compute_rms(sp, st->subframeSize)/(.01+compute_rms(tmp_sig, st->subframeSize));
             /*fprintf (stderr, "gains: %f %f\n", g, g2);*/
             g = g2;
@@ -698,9 +697,9 @@
          spx_word16_t gc;
          spx_word32_t scale;
          spx_word16_t el;
-         el = compute_rms(low_exc+offset, st->subframeSize);
+         el = compute_rms16(low_exc+offset, st->subframeSize);
 
-         gc = DIV32_16(MULT16_16(filter_ratio,1+eh),1+el);
+         gc = PDIV32_16(MULT16_16(filter_ratio,1+eh),1+el);
 
          /* This is a kludge that cleans up a historical bug */
          if (st->subframeSize==80)
@@ -726,7 +725,7 @@
          if (st->subframeSize==80)
             gc *= 1.4142;
 
-         scale = SHL(MULT16_16(DIV32_16(SHL(gc,SIG_SHIFT-4),filter_ratio),(1+el)),4);
+         scale = SHL32(MULT16_16(PDIV32_16(SHL32(EXTEND32(gc),SIG_SHIFT-6),filter_ratio),(1+el)),6);
 
          compute_impulse_response(st->interp_qlpc, st->bw_lpc1, st->bw_lpc2, syn_resp, st->subframeSize, st->lpcSize, stack);
 
@@ -751,7 +750,7 @@
 
          /* Compute target signal */
          for (i=0;i<st->subframeSize;i++)
-            target[i]=sw[i]-res[i];
+            target[i]=PSHR32(sw[i]-res[i],SIG_SHIFT);
 
          for (i=0;i<st->subframeSize;i++)
            exc[i]=0;
@@ -773,6 +772,12 @@
          for (i=0;i<st->subframeSize;i++)
             exc[i] = ADD32(exc[i], innov[i]);
 
+         if (st->innov_save)
+         {
+            for (i=0;i<st->subframeSize;i++)
+               innov_save[2*i]=innov[i];
+         }
+         
          if (SUBMODE(double_codebook)) {
             char *tmp_stack=stack;
             VARDECL(spx_sig_t *innov2);
@@ -871,6 +876,7 @@
    st->g1_mem=speex_alloc((QMF_ORDER)*sizeof(spx_word32_t));
 
    st->exc=speex_alloc((st->frame_size)*sizeof(spx_sig_t));
+   st->excBuf=speex_alloc((st->subframeSize)*sizeof(spx_sig_t));
 
    st->qlsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
    st->old_qlsp = speex_alloc((st->lpcSize)*sizeof(spx_lsp_t));
@@ -880,6 +886,11 @@
    st->pi_gain = speex_alloc((st->nbSubframes)*sizeof(spx_word32_t));
    st->mem_sp = speex_alloc((2*st->lpcSize)*sizeof(spx_mem_t));
    
+   st->low_innov = speex_alloc((st->frame_size)*sizeof(spx_word32_t));
+   speex_decoder_ctl(st->st_low, SPEEX_SET_INNOVATION_SAVE, st->low_innov);
+   st->innov_save = NULL;
+
+
    st->lpc_enh_enabled=0;
    st->seed = 1000;
 
@@ -906,6 +917,7 @@
    speex_free(st->g0_mem);
    speex_free(st->g1_mem);
    speex_free(st->exc);
+   speex_free(st->excBuf);
    speex_free(st->qlsp);
    speex_free(st->old_qlsp);
    speex_free(st->interp_qlsp);
@@ -919,9 +931,6 @@
 static void sb_decode_lost(SBDecState *st, spx_word16_t *out, int dtx, char *stack)
 {
    int i;
-   VARDECL(spx_coef_t *awk1);
-   VARDECL(spx_coef_t *awk2);
-   VARDECL(spx_coef_t *awk3);
    int saved_modeid=0;
 
    if (dtx)
@@ -934,28 +943,6 @@
 
    st->first=1;
    
-   ALLOC(awk1, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk2, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk3, st->lpcSize+1, spx_coef_t);
-   
-   if (st->lpc_enh_enabled)
-   {
-      spx_word16_t k1,k2,k3;
-      if (st->submodes[st->submodeID] != NULL)
-      {
-         k1=SUBMODE(lpc_enh_k1);
-         k2=SUBMODE(lpc_enh_k2);
-         k3=SUBMODE(lpc_enh_k3);
-      } else {
-         k1=k2=.7*GAMMA_SCALING;
-         k3 = 0;
-      }
-      bw_lpc(k1, st->interp_qlpc, awk1, st->lpcSize);
-      bw_lpc(k2, st->interp_qlpc, awk2, st->lpcSize);
-      bw_lpc(k3, st->interp_qlpc, awk3, st->lpcSize);
-      /*fprintf (stderr, "%f %f %f\n", k1, k2, k3);*/
-   }
-   
    
    /* Final signal synthesis from excitation */
    if (!dtx)
@@ -969,22 +956,9 @@
    for (i=0;i<st->frame_size;i++)
       st->high[i]=st->exc[i];
 
-   if (st->lpc_enh_enabled)
-   {
-      /* Use enhanced LPC filter */
-      filter_mem2(st->high, awk2, awk1, st->high, st->frame_size, st->lpcSize, 
-                  st->mem_sp+st->lpcSize);
-      filter_mem2(st->high, awk3, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, 
-                  st->mem_sp);
-   } else {
-      /* Use regular filter */
-      for (i=0;i<st->lpcSize;i++)
-         st->mem_sp[st->lpcSize+i] = 0;
-      iir_mem2(st->high, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, 
-               st->mem_sp);
-   }
+   iir_mem2(st->high, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, 
+            st->mem_sp);
    
-   /*iir_mem2(st->exc, st->interp_qlpc, st->high, st->frame_size, st->lpcSize, st->mem_sp);*/
    
    /* Reconstruct the original */
    fir_mem_up(st->x0d, h0, st->y0, st->full_frame_size, QMF_ORDER, st->g0_mem, stack);
@@ -1008,11 +982,8 @@
    int ret;
    char *stack;
    VARDECL(spx_word32_t *low_pi_gain);
-   VARDECL(spx_sig_t *low_exc);
-   VARDECL(spx_sig_t *low_innov);
-   VARDECL(spx_coef_t *awk1);
-   VARDECL(spx_coef_t *awk2);
-   VARDECL(spx_coef_t *awk3);
+   VARDECL(spx_word16_t *low_exc);
+   VARDECL(spx_coef_t *ak);
    int dtx;
    const SpeexSBMode *mode;
    spx_word16_t *out = vout;
@@ -1101,11 +1072,9 @@
       st->exc[i]=0;
 
    ALLOC(low_pi_gain, st->nbSubframes, spx_word32_t);
-   ALLOC(low_exc, st->frame_size, spx_sig_t);
-   ALLOC(low_innov, st->frame_size, spx_sig_t);
+   ALLOC(low_exc, st->frame_size, spx_word16_t);
    speex_decoder_ctl(st->st_low, SPEEX_GET_PI_GAIN, low_pi_gain);
    speex_decoder_ctl(st->st_low, SPEEX_GET_EXC, low_exc);
-   speex_decoder_ctl(st->st_low, SPEEX_GET_INNOV, low_innov);
 
    SUBMODE(lsp_unquant)(st->qlsp, st->lpcSize, bits);
    
@@ -1115,13 +1084,11 @@
          st->old_qlsp[i] = st->qlsp[i];
    }
    
-   ALLOC(awk1, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk2, st->lpcSize+1, spx_coef_t);
-   ALLOC(awk3, st->lpcSize+1, spx_coef_t);
+   ALLOC(ak, st->lpcSize, spx_coef_t);
 
    for (sub=0;sub<st->nbSubframes;sub++)
    {
-      spx_sig_t *exc, *sp;
+      spx_sig_t *exc, *sp, *innov_save=NULL;
       spx_word16_t filter_ratio;
       spx_word16_t el=0;
       int offset;
@@ -1130,6 +1097,13 @@
       offset = st->subframeSize*sub;
       sp=st->high+offset;
       exc=st->exc+offset;
+      /* Pointer for saving innovation */
+      if (st->innov_save)
+      {
+         innov_save = st->innov_save+2*offset;
+         for (i=0;i<2*st->subframeSize;i++)
+            innov_save[i]=0;
+      }
       
       /* LSP interpolation */
       lsp_interpolate(st->old_qlsp, st->qlsp, st->interp_qlsp, st->lpcSize, sub, st->nbSubframes);
@@ -1137,21 +1111,7 @@
       lsp_enforce_margin(st->interp_qlsp, st->lpcSize, LSP_MARGIN);
 
       /* LSP to LPC */
-      lsp_to_lpc(st->interp_qlsp, st->interp_qlpc, st->lpcSize, stack);
-
-
-      if (st->lpc_enh_enabled)
-      {
-         spx_word16_t k1,k2,k3;
-         k1=SUBMODE(lpc_enh_k1);
-         k2=SUBMODE(lpc_enh_k2);
-         k3=SUBMODE(lpc_enh_k3);
-         bw_lpc(k1, st->interp_qlpc, awk1, st->lpcSize);
-         bw_lpc(k2, st->interp_qlpc, awk2, st->lpcSize);
-         bw_lpc(k3, st->interp_qlpc, awk3, st->lpcSize);
-         /*fprintf (stderr, "%f %f %f\n", k1, k2, k3);*/
-      }
-
+      lsp_to_lpc(st->interp_qlsp, ak, st->lpcSize, stack);
 
       /* Calculate reponse ratio between the low and high filter in the middle
          of the band (4000 Hz) */
@@ -1166,7 +1126,7 @@
 
          rl = low_pi_gain[sub];
 #ifdef FIXED_POINT
-         filter_ratio=DIV32_16(SHL(rl+82,2),SHR(82+rh,5));
+         filter_ratio=PDIV32_16(SHL(rl+82,2),SHR(82+rh,5));
 #else
          filter_ratio=(rl+.01)/(rh+.01);
 #endif
@@ -1190,7 +1150,7 @@
          
 #if 0
          for (i=0;i<st->subframeSize;i++)
-            exc[i]=mode->folding_gain*g*low_innov[offset+i];
+            exc[i]=mode->folding_gain*g*st->low_innov[offset+i];
 #else
          {
             float tmp=1;
@@ -1199,7 +1159,7 @@
             el = compute_rms(low_innov+offset, st->subframeSize);*/
             for (i=0;i<st->subframeSize;i++)
             {
-               float e=tmp*g*mode->folding_gain*low_innov[offset+i];
+               float e=tmp*g*mode->folding_gain*st->low_innov[offset+i];
                tmp *= -1;
                exc[i] = e;
                /*float r = speex_rand(g*el,&seed);
@@ -1210,14 +1170,13 @@
             
          }
          
-         /*speex_rand_vec(mode->folding_gain*g*el, exc, st->subframeSize);*/
 #endif    
       } else {
          spx_word16_t gc;
          spx_word32_t scale;
          int qgc = speex_bits_unpack_unsigned(bits, 4);
 
-         el = compute_rms(low_exc+offset, st->subframeSize);
+         el = compute_rms16(low_exc+offset, st->subframeSize);
 
 #ifdef FIXED_POINT
          gc = MULT16_32_Q15(28626,gc_quant_bound[qgc]);
@@ -1228,7 +1187,7 @@
          if (st->subframeSize==80)
             gc *= 1.4142;
 
-         scale = SHL(MULT16_16(DIV32_16(SHL(gc,SIG_SHIFT-4),filter_ratio),(1+el)),4);
+         scale = SHL(MULT16_16(PDIV32_16(SHL(gc,SIG_SHIFT-6),filter_ratio),(1+el)),6);
 
          SUBMODE(innovation_unquant)(exc, SUBMODE(innovation_params), st->subframeSize, 
                                 bits, stack);
@@ -1251,24 +1210,21 @@
          }
 
       }
-
-      for (i=0;i<st->subframeSize;i++)
-         sp[i]=exc[i];
-      if (st->lpc_enh_enabled)
+      
+      if (st->innov_save)
       {
-         /* Use enhanced LPC filter */
-         filter_mem2(sp, awk2, awk1, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp+st->lpcSize);
-         filter_mem2(sp, awk3, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
-      } else {
-         /* Use regular filter */
-         for (i=0;i<st->lpcSize;i++)
-            st->mem_sp[st->lpcSize+i] = 0;
-         iir_mem2(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
-                     st->mem_sp);
+         for (i=0;i<st->subframeSize;i++)
+            innov_save[2*i]=exc[i];
       }
-      /*iir_mem2(exc, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, st->mem_sp);*/
+      
+      for (i=0;i<st->subframeSize;i++)
+         sp[i]=st->excBuf[i];
+      iir_mem2(sp, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, 
+               st->mem_sp);
+      for (i=0;i<st->subframeSize;i++)
+         st->excBuf[i]=exc[i];
+      for (i=0;i<st->lpcSize;i++)
+         st->interp_qlpc[i] = ak[i];
 
    }
 
@@ -1345,13 +1301,14 @@
       (*(float*)ptr) = st->vbr_quality;
       break;
    case SPEEX_SET_ABR:
-      st->abr_enabled = (*(int*)ptr);
-      st->vbr_enabled = 1;
+      st->abr_enabled = (*(spx_int32_t*)ptr);
+      st->vbr_enabled = st->abr_enabled!=0;
       speex_encoder_ctl(st->st_low, SPEEX_SET_VBR, &st->vbr_enabled);
+      if (st->vbr_enabled) 
       {
          int i=10, rate, target;
          float vbr_qual;
-         target = (*(int*)ptr);
+         target = (*(spx_int32_t*)ptr);
          while (i>=0)
          {
             speex_encoder_ctl(st, SPEEX_SET_QUALITY, &i);
@@ -1371,7 +1328,7 @@
       
       break;
    case SPEEX_GET_ABR:
-      (*(int*)ptr) = st->abr_enabled;
+      (*(spx_int32_t*)ptr) = st->abr_enabled;
       break;
    case SPEEX_SET_QUALITY:
       {
@@ -1397,8 +1354,9 @@
       break;
    case SPEEX_SET_BITRATE:
       {
-         int i=10, rate, target;
-         target = (*(int*)ptr);
+         int i=10;
+         spx_int32_t rate, target;
+         target = (*(spx_int32_t*)ptr);
          while (i>=0)
          {
             speex_encoder_ctl(st, SPEEX_SET_QUALITY, &i);
@@ -1413,21 +1371,21 @@
       speex_encoder_ctl(st->st_low, request, ptr);
       /*fprintf (stderr, "before: %d\n", (*(int*)ptr));*/
       if (st->submodes[st->submodeID])
-         (*(int*)ptr) += st->sampling_rate*SUBMODE(bits_per_frame)/st->full_frame_size;
+         (*(spx_int32_t*)ptr) += st->sampling_rate*SUBMODE(bits_per_frame)/st->full_frame_size;
       else
-         (*(int*)ptr) += st->sampling_rate*(SB_SUBMODE_BITS+1)/st->full_frame_size;
+         (*(spx_int32_t*)ptr) += st->sampling_rate*(SB_SUBMODE_BITS+1)/st->full_frame_size;
       /*fprintf (stderr, "after: %d\n", (*(int*)ptr));*/
       break;
    case SPEEX_SET_SAMPLING_RATE:
       {
-         int tmp=(*(int*)ptr);
+         spx_int32_t tmp=(*(spx_int32_t*)ptr);
          st->sampling_rate = tmp;
          tmp>>=1;
          speex_encoder_ctl(st->st_low, SPEEX_SET_SAMPLING_RATE, &tmp);
       }
       break;
    case SPEEX_GET_SAMPLING_RATE:
-      (*(int*)ptr)=st->sampling_rate;
+      (*(spx_int32_t*)ptr)=st->sampling_rate;
       break;
    case SPEEX_RESET_STATE:
       {
@@ -1454,6 +1412,45 @@
       speex_encoder_ctl(st->st_low, SPEEX_GET_LOOKAHEAD, ptr);
       (*(int*)ptr) = 2*(*(int*)ptr) + QMF_ORDER - 1;
       break;
+   case SPEEX_SET_PLC_TUNING:
+      speex_encoder_ctl(st->st_low, SPEEX_SET_PLC_TUNING, ptr);
+      break;
+   case SPEEX_GET_PLC_TUNING:
+      speex_encoder_ctl(st->st_low, SPEEX_GET_PLC_TUNING, ptr);
+      break;
+   case SPEEX_SET_VBR_MAX_BITRATE:
+      {
+         st->vbr_max = (*(spx_int32_t*)ptr); /* Total (low-band + high-band) limit requested by the caller */
+         if (st->vbr_max<1) /* Test the requested value, not the request-code macro (which is always 42) */
+         {
+            speex_encoder_ctl(st->st_low, SPEEX_SET_VBR_MAX_BITRATE, &st->vbr_max);
+            st->vbr_max_high = 17600;
+         } else {
+            spx_int32_t low_rate;
+            /* FIXME: Need to adapt that to ultra-wideband */
+            if (st->vbr_max >= 42200)
+            {
+               st->vbr_max_high = 17600;
+            } else if (st->vbr_max >= 27800)
+            {
+               st->vbr_max_high = 9600;
+            } else if (st->vbr_max > 20600)
+            {
+               st->vbr_max_high = 5600;
+            } else {
+               st->vbr_max_high = 1800;
+            }
+            low_rate = st->vbr_max - st->vbr_max_high; /* Whatever the high band does not take goes to the low band */
+            speex_encoder_ctl(st->st_low, SPEEX_SET_VBR_MAX_BITRATE, &low_rate);
+         }
+      }
+      break;
+   case SPEEX_GET_VBR_MAX_BITRATE:
+      (*(spx_int32_t*)ptr) = st->vbr_max;
+      break;
+
+
+   /* This is all internal stuff past this point */
    case SPEEX_GET_PI_GAIN:
       {
          int i;
@@ -1485,6 +1482,9 @@
    case SPEEX_GET_RELATIVE_QUALITY:
       (*(float*)ptr)=st->relative_quality;
       break;
+   case SPEEX_SET_INNOVATION_SAVE:
+      st->innov_save = ptr;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
@@ -1534,20 +1534,20 @@
    case SPEEX_GET_BITRATE:
       speex_decoder_ctl(st->st_low, request, ptr);
       if (st->submodes[st->submodeID])
-         (*(int*)ptr) += st->sampling_rate*SUBMODE(bits_per_frame)/st->full_frame_size;
+         (*(spx_int32_t*)ptr) += st->sampling_rate*SUBMODE(bits_per_frame)/st->full_frame_size;
       else
-         (*(int*)ptr) += st->sampling_rate*(SB_SUBMODE_BITS+1)/st->full_frame_size;
+         (*(spx_int32_t*)ptr) += st->sampling_rate*(SB_SUBMODE_BITS+1)/st->full_frame_size;
       break;
    case SPEEX_SET_SAMPLING_RATE:
       {
-         int tmp=(*(int*)ptr);
+         spx_int32_t tmp=(*(spx_int32_t*)ptr);
          st->sampling_rate = tmp;
          tmp>>=1;
          speex_decoder_ctl(st->st_low, SPEEX_SET_SAMPLING_RATE, &tmp);
       }
       break;
    case SPEEX_GET_SAMPLING_RATE:
-      (*(int*)ptr)=st->sampling_rate;
+      (*(spx_int32_t*)ptr)=st->sampling_rate;
       break;
    case SPEEX_SET_HANDLER:
       speex_decoder_ctl(st->st_low, SPEEX_SET_HANDLER, ptr);
@@ -1571,6 +1571,10 @@
    case SPEEX_GET_SUBMODE_ENCODING:
       (*(int*)ptr) = st->encode_submode;
       break;
+   case SPEEX_GET_LOOKAHEAD:
+      speex_decoder_ctl(st->st_low, SPEEX_GET_LOOKAHEAD, ptr);
+      (*(int*)ptr) = 2*(*(int*)ptr);
+      break;
    case SPEEX_GET_PI_GAIN:
       {
          int i;
@@ -1602,6 +1606,9 @@
    case SPEEX_GET_DTX_STATUS:
       speex_decoder_ctl(st->st_low, SPEEX_GET_DTX_STATUS, ptr);
       break;
+   case SPEEX_SET_INNOVATION_SAVE:
+      st->innov_save = ptr;
+      break;
    default:
       speex_warning_int("Unknown nb_ctl request: ", request);
       return -1;
diff --git a/pjmedia/src/pjmedia-codec/speex/sb_celp.h b/pjmedia/src/pjmedia-codec/speex/sb_celp.h
index c283109..194cdd9 100644
--- a/pjmedia/src/pjmedia-codec/speex/sb_celp.h
+++ b/pjmedia/src/pjmedia-codec/speex/sb_celp.h
@@ -42,60 +42,63 @@
 
 /**Structure representing the full state of the sub-band encoder*/
 typedef struct SBEncState {
-   const SpeexMode *mode;            /**< Pointer to the mode (containing for vtable info) */
-   void *st_low;               /**< State of the low-band (narrowband) encoder */
-   int    full_frame_size;     /**< Length of full-band frames*/
-   int    frame_size;          /**< Length of high-band frames*/
-   int    subframeSize;        /**< Length of high-band sub-frames*/
-   int    nbSubframes;         /**< Number of high-band sub-frames*/
-   int    windowSize;          /**< Length of high-band LPC window*/
-   int    lpcSize;             /**< Order of high-band LPC analysis */
-   int    bufSize;             /**< Buffer size */
-   int    first;               /**< First frame? */
-   float  lag_factor;          /**< Lag-windowing control parameter */
-   spx_word16_t  lpc_floor;           /**< Controls LPC analysis noise floor */
-   spx_word16_t  gamma1;              /**< Perceptual weighting coef 1 */
-   spx_word16_t  gamma2;              /**< Perceptual weighting coef 2 */
+   const SpeexMode *mode;         /**< Pointer to the mode (containing for vtable info) */
+   void *st_low;                  /**< State of the low-band (narrowband) encoder */
+   int    full_frame_size;        /**< Length of full-band frames*/
+   int    frame_size;             /**< Length of high-band frames*/
+   int    subframeSize;           /**< Length of high-band sub-frames*/
+   int    nbSubframes;            /**< Number of high-band sub-frames*/
+   int    windowSize;             /**< Length of high-band LPC window*/
+   int    lpcSize;                /**< Order of high-band LPC analysis */
+   int    bufSize;                /**< Buffer size */
+   int    first;                  /**< First frame? */
+   float  lag_factor;             /**< Lag-windowing control parameter */
+   spx_word16_t  lpc_floor;       /**< Controls LPC analysis noise floor */
+   spx_word16_t  gamma1;          /**< Perceptual weighting coef 1 */
+   spx_word16_t  gamma2;          /**< Perceptual weighting coef 2 */
 
-   char  *stack;               /**< Temporary allocation stack */
-   spx_sig_t *x0d, *x1d; /**< QMF filter signals*/
-   spx_sig_t *high;                /**< High-band signal (buffer) */
-   spx_sig_t *y0, *y1;             /**< QMF synthesis signals */
+   char  *stack;                  /**< Temporary allocation stack */
+   spx_sig_t *x0d, *x1d;          /**< QMF filter signals*/
+   spx_sig_t *high;               /**< High-band signal (buffer) */
+   spx_sig_t *y0, *y1;            /**< QMF synthesis signals */
    spx_word16_t *h0_mem, *h1_mem;
    spx_word32_t *g0_mem, *g1_mem; /**< QMF memories */
 
-   spx_sig_t *excBuf;              /**< High-band excitation */
-   spx_sig_t *exc;                 /**< High-band excitation (for QMF only)*/
-   spx_sig_t *res;                 /**< Zero-input response (ringing) */
-   spx_sig_t *sw;                  /**< Perceptually weighted signal */
-   spx_sig_t *target;              /**< Weighted target signal (analysis by synthesis) */
-   spx_word16_t *window;              /**< LPC analysis window */
-   spx_word16_t *lagWindow;           /**< Auto-correlation window */
-   spx_word16_t *autocorr;            /**< Auto-correlation (for LPC analysis) */
-   spx_coef_t *lpc;                 /**< LPC coefficients */
-   spx_lsp_t *lsp;                 /**< LSP coefficients */
-   spx_lsp_t *qlsp;                /**< Quantized LSPs */
-   spx_lsp_t *old_lsp;             /**< LSPs of previous frame */
-   spx_lsp_t *old_qlsp;            /**< Quantized LSPs of previous frame */
-   spx_lsp_t *interp_lsp;          /**< Interpolated LSPs for current sub-frame */
-   spx_lsp_t *interp_qlsp;         /**< Interpolated quantized LSPs for current sub-frame */
-   spx_coef_t *interp_lpc;          /**< Interpolated LPCs for current sub-frame */
-   spx_coef_t *interp_qlpc;         /**< Interpolated quantized LPCs for current sub-frame */
-   spx_coef_t *bw_lpc1;             /**< Bandwidth-expanded version of LPCs (#1) */
-   spx_coef_t *bw_lpc2;             /**< Bandwidth-expanded version of LPCs (#2) */
+   spx_sig_t *excBuf;             /**< High-band excitation */
+   spx_sig_t *exc;                /**< High-band excitation (for QMF only)*/
+   spx_sig_t *res;                /**< Zero-input response (ringing) */
+   spx_sig_t *sw;                 /**< Perceptually weighted signal */
+   const spx_word16_t *window;    /**< LPC analysis window */
+   spx_word16_t *lagWindow;       /**< Auto-correlation window */
+   spx_word16_t *autocorr;        /**< Auto-correlation (for LPC analysis) */
+   spx_coef_t *lpc;               /**< LPC coefficients */
+   spx_lsp_t *lsp;                /**< LSP coefficients */
+   spx_lsp_t *qlsp;               /**< Quantized LSPs */
+   spx_lsp_t *old_lsp;            /**< LSPs of previous frame */
+   spx_lsp_t *old_qlsp;           /**< Quantized LSPs of previous frame */
+   spx_lsp_t *interp_lsp;         /**< Interpolated LSPs for current sub-frame */
+   spx_lsp_t *interp_qlsp;        /**< Interpolated quantized LSPs for current sub-frame */
+   spx_coef_t *interp_lpc;        /**< Interpolated LPCs for current sub-frame */
+   spx_coef_t *interp_qlpc;       /**< Interpolated quantized LPCs for current sub-frame */
+   spx_coef_t *bw_lpc1;           /**< Bandwidth-expanded version of LPCs (#1) */
+   spx_coef_t *bw_lpc2;           /**< Bandwidth-expanded version of LPCs (#2) */
 
-   spx_mem_t *mem_sp;              /**< Synthesis signal memory */
+   spx_mem_t *mem_sp;             /**< Synthesis signal memory */
    spx_mem_t *mem_sp2;
-   spx_mem_t *mem_sw;              /**< Perceptual signal memory */
+   spx_mem_t *mem_sw;             /**< Perceptual signal memory */
    spx_word32_t *pi_gain;
+   spx_sig_t *innov_save;         /**< If non-NULL, innovation is copied here */
+   spx_sig_t *low_innov;          /**< Lower-band innovation is copied here magically */
 
-   float  vbr_quality;         /**< Quality setting for VBR encoding */
-   int    vbr_enabled;         /**< 1 for enabling VBR, 0 otherwise */
-   int    abr_enabled;         /**< ABR setting (in bps), 0 if off */
+   float  vbr_quality;            /**< Quality setting for VBR encoding */
+   int    vbr_enabled;            /**< 1 for enabling VBR, 0 otherwise */
+   spx_int32_t vbr_max;           /**< Max bit-rate allowed in VBR mode (total) */
+   spx_int32_t vbr_max_high;      /**< Max bit-rate allowed in VBR mode for the high-band */
+   spx_int32_t abr_enabled;       /**< ABR setting (in bps), 0 if off */
    float  abr_drift;
    float  abr_drift2;
    float  abr_count;
-   int    vad_enabled;         /**< 1 for enabling VAD, 0 otherwise */
+   int    vad_enabled;            /**< 1 for enabling VAD, 0 otherwise */
    float  relative_quality;
 
    int    encode_submode;
@@ -103,7 +106,7 @@
    int    submodeID;
    int    submodeSelect;
    int    complexity;
-   int    sampling_rate;
+   spx_int32_t sampling_rate;
 
 } SBEncState;
 
@@ -118,7 +121,7 @@
    int    nbSubframes;
    int    lpcSize;
    int    first;
-   int    sampling_rate;
+   spx_int32_t sampling_rate;
    int    lpc_enh_enabled;
 
    char  *stack;
@@ -128,6 +131,7 @@
    spx_word32_t *g0_mem, *g1_mem;
 
    spx_sig_t *exc;
+   spx_sig_t *excBuf;
    spx_lsp_t *qlsp;
    spx_lsp_t *old_qlsp;
    spx_lsp_t *interp_qlsp;
@@ -135,6 +139,9 @@
 
    spx_mem_t *mem_sp;
    spx_word32_t *pi_gain;
+   spx_sig_t *innov_save;      /**< If non-NULL, innovation is copied here */
+   spx_sig_t *low_innov;       /**< Lower-band innovation is copied here magically */
+   
    spx_int32_t seed;
 
    int    encode_submode;
diff --git a/pjmedia/src/pjmedia-codec/speex/speex.h b/pjmedia/src/pjmedia-codec/speex/speex.h
index 0eb2b8a..c7f7547 100644
--- a/pjmedia/src/pjmedia-codec/speex/speex.h
+++ b/pjmedia/src/pjmedia-codec/speex/speex.h
@@ -141,6 +141,11 @@
 /** Gets tuning for PLC */
 #define SPEEX_GET_PLC_TUNING 41
 
+/** Sets the max bit-rate allowed in VBR mode */
+#define SPEEX_SET_VBR_MAX_BITRATE 42
+/** Gets the max bit-rate allowed in VBR mode */
+#define SPEEX_GET_VBR_MAX_BITRATE 43
+
 /* Used internally, not to be used in applications */
 /** Used internally*/
 #define SPEEX_GET_PI_GAIN 100
@@ -150,6 +155,8 @@
 #define SPEEX_GET_INNOV   102
 /** Used internally*/
 #define SPEEX_GET_DTX_STATUS   103
+/** Used internally*/
+#define SPEEX_SET_INNOVATION_SAVE   104
 
 
 /* Preserving compatibility:*/
diff --git a/pjmedia/src/pjmedia-codec/speex/speex_echo.h b/pjmedia/src/pjmedia-codec/speex/speex_echo.h
index 1962a56..4813b5a 100644
--- a/pjmedia/src/pjmedia-codec/speex/speex_echo.h
+++ b/pjmedia/src/pjmedia-codec/speex/speex_echo.h
@@ -61,7 +61,13 @@
 void speex_echo_state_destroy(SpeexEchoState *st);
 
 /** Performs echo cancellation a frame */
-void speex_echo_cancel(SpeexEchoState *st, short *ref, short *echo, short *out, spx_int32_t *Y);
+void speex_echo_cancel(SpeexEchoState *st, const spx_int16_t *rec, const spx_int16_t *play, spx_int16_t *out, spx_int32_t *Yout);
+
+/** Perform echo cancellation using internal playback buffer */
+void speex_echo_capture(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *out, spx_int32_t *Yout);
+
+/** Let the echo canceller know that a frame was just played */
+void speex_echo_playback(SpeexEchoState *st, const spx_int16_t *play);
 
 /** Reset the echo canceller state */
 void speex_echo_state_reset(SpeexEchoState *st);
diff --git a/pjmedia/src/pjmedia-codec/speex/speex_jitter.h b/pjmedia/src/pjmedia-codec/speex/speex_jitter.h
index 31b5c53..34043b3 100644
--- a/pjmedia/src/pjmedia-codec/speex/speex_jitter.h
+++ b/pjmedia/src/pjmedia-codec/speex/speex_jitter.h
@@ -43,32 +43,55 @@
 extern "C" {
 #endif
 
-#define SPEEX_JITTER_MAX_PACKET_SIZE 1500 /**< Maximum number of bytes per packet         */
-#define SPEEX_JITTER_MAX_BUFFER_SIZE 20   /**< Maximum number of packets in jitter buffer */
+struct JitterBuffer_;
 
-#define MAX_MARGIN 12  /**< Number of bins in margin histogram */
+typedef struct JitterBuffer_ JitterBuffer;
+
+typedef struct _JitterBufferPacket JitterBufferPacket;
+
+struct _JitterBufferPacket {
+   char        *data;
+   spx_uint32_t len;
+   spx_uint32_t timestamp;
+   spx_uint32_t span;
+};
+
+
+#define JITTER_BUFFER_OK 0
+#define JITTER_BUFFER_MISSING 1
+#define JITTER_BUFFER_INCOMPLETE 2
+#define JITTER_BUFFER_INTERNAL_ERROR -1
+#define JITTER_BUFFER_BAD_ARGUMENT -2
+
+/** Initialise jitter buffer */
+JitterBuffer *jitter_buffer_init(int tick);
+
+/** Reset jitter buffer */
+void jitter_buffer_reset(JitterBuffer *jitter);
+
+/** Destroy jitter buffer */
+void jitter_buffer_destroy(JitterBuffer *jitter);
+
+/** Put one packet into the jitter buffer */
+void jitter_buffer_put(JitterBuffer *jitter, const JitterBufferPacket *packet);
+
+/** Get one packet from the jitter buffer */
+int jitter_buffer_get(JitterBuffer *jitter, JitterBufferPacket *packet, spx_uint32_t *current_timestamp);
+
+/** Get pointer timestamp of jitter buffer */
+int jitter_buffer_get_pointer_timestamp(JitterBuffer *jitter);
+
+/** Advance by one tick */
+void jitter_buffer_tick(JitterBuffer *jitter);
+
 
 /** Speex jitter-buffer state. */
 typedef struct SpeexJitter {
-   int buffer_size;                                                       /**< Buffer size                         */
-   int pointer_timestamp;                                                 /**< Pointer timestamp                   */
-
    SpeexBits current_packet;                                              /**< Current Speex packet                */
    int valid_bits;                                                        /**< True if Speex bits are valid        */
-
-   char buf[SPEEX_JITTER_MAX_BUFFER_SIZE][SPEEX_JITTER_MAX_PACKET_SIZE];  /**< Buffer of packets                   */
-   int timestamp[SPEEX_JITTER_MAX_BUFFER_SIZE];                           /**< Timestamp of packet                 */
-   int len[SPEEX_JITTER_MAX_BUFFER_SIZE];                                 /**< Number of bytes in packet           */
-
+   JitterBuffer *packets;
    void *dec;                                                             /**< Pointer to Speex decoder            */
    int frame_size;                                                        /**< Frame size of Speex decoder         */
-   int frame_time;                                                        /**< Frame time in [ms] of Speex decoder */
-   int reset_state;                                                       /**< True if Speex state was reset       */
-   
-   int lost_count;                                                        /**< Number of lost packets              */
-   float shortterm_margin[MAX_MARGIN];                                    /**< Short term margins                  */
-   float longterm_margin[MAX_MARGIN];                                     /**< Long term margins                   */
-   float loss_rate;                                                       /**< Loss rate                           */
 } SpeexJitter;
 
 /** Initialise jitter buffer */
@@ -81,7 +104,7 @@
 void speex_jitter_put(SpeexJitter *jitter, char *packet, int len, int timestamp);
 
 /** Get one packet from the jitter buffer */
-void speex_jitter_get(SpeexJitter *jitter, short *out, int *current_timestamp);
+void speex_jitter_get(SpeexJitter *jitter, spx_int16_t *out, int *start_offset);
 
 /** Get pointer timestamp of jitter buffer */
 int speex_jitter_get_pointer_timestamp(SpeexJitter *jitter);
diff --git a/pjmedia/src/pjmedia-codec/speex/speex_stereo.h b/pjmedia/src/pjmedia-codec/speex/speex_stereo.h
index 0b70021..6ccaa31 100644
--- a/pjmedia/src/pjmedia-codec/speex/speex_stereo.h
+++ b/pjmedia/src/pjmedia-codec/speex/speex_stereo.h
@@ -53,7 +53,7 @@
 } SpeexStereoState;
 
 /** Initialization value for a stereo state */
-#define SPEEX_STEREO_STATE_INIT {1,.5,1,1}
+#define SPEEX_STEREO_STATE_INIT {1,.5,1,1,0,0}
 
 /** Transforms a stereo frame into a mono frame and stores intensity stereo info in 'bits' */
 void speex_encode_stereo(float *data, int frame_size, SpeexBits *bits);
diff --git a/pjmedia/src/pjmedia-codec/speex/speex_types.h b/pjmedia/src/pjmedia-codec/speex/speex_types.h
index b67c74f..a6ebf0c 100644
--- a/pjmedia/src/pjmedia-codec/speex/speex_types.h
+++ b/pjmedia/src/pjmedia-codec/speex/speex_types.h
@@ -26,7 +26,6 @@
 
 #  if defined(__CYGWIN__)
 #    include <_G_config.h>
-     typedef _G_int64_t spx_int64_t;
      typedef _G_int32_t spx_int32_t;
      typedef _G_uint32_t spx_uint32_t;
      typedef _G_int16_t spx_int16_t;
@@ -36,17 +35,13 @@
      typedef unsigned short spx_uint16_t;                                                                   
      typedef int spx_int32_t;                                                                               
      typedef unsigned int spx_uint32_t;                                                                     
-     typedef long long spx_int64_t;                                                                         
-     typedef unsigned long long spx_uint64_t;  
 #  elif defined(__MWERKS__)
-     typedef long long spx_int64_t;
      typedef int spx_int32_t;
      typedef unsigned int spx_uint32_t;
      typedef short spx_int16_t;
      typedef unsigned short spx_uint16_t;
 #  else
      /* MSVC/Borland */
-     typedef __int64 spx_int64_t;
      typedef __int32 spx_int32_t;
      typedef unsigned __int32 spx_uint32_t;
      typedef __int16 spx_int16_t;
@@ -60,7 +55,6 @@
    typedef UInt16 spx_uint16_t;
    typedef SInt32 spx_int32_t;
    typedef UInt32 spx_uint32_t;
-   typedef SInt64 spx_int64_t;
 
 #elif defined(__MACOSX__) /* MacOS X Framework build */
 
@@ -69,7 +63,6 @@
    typedef u_int16_t spx_uint16_t;
    typedef int32_t spx_int32_t;
    typedef u_int32_t spx_uint32_t;
-   typedef int64_t spx_int64_t;
 
 #elif defined(__BEOS__)
 
@@ -79,7 +72,6 @@
    typedef u_int16_t spx_uint16_t;
    typedef int32_t spx_int32_t;
    typedef u_int32_t spx_uint32_t;
-   typedef int64_t spx_int64_t;
 
 #elif defined (__EMX__)
 
@@ -88,7 +80,6 @@
    typedef unsigned short spx_uint16_t;
    typedef int spx_int32_t;
    typedef unsigned int spx_uint32_t;
-   typedef long long spx_int64_t;
 
 #elif defined (DJGPP)
 
@@ -96,12 +87,10 @@
    typedef short spx_int16_t;
    typedef int spx_int32_t;
    typedef unsigned int spx_uint32_t;
-   typedef long long spx_int64_t;
 
 #elif defined(R5900)
 
    /* PS2 EE */
-   typedef long spx_int64_t;
    typedef int spx_int32_t;
    typedef unsigned spx_uint32_t;
    typedef short spx_int16_t;
@@ -113,7 +102,6 @@
    typedef unsigned short spx_uint16_t;
    typedef signed int spx_int32_t;
    typedef unsigned int spx_uint32_t;
-   typedef long long int spx_int64_t;
 
 #elif defined(CONFIG_TI_C54X) || defined (CONFIG_TI_C55X)
 
@@ -122,7 +110,7 @@
    typedef long spx_int32_t;
    typedef unsigned long spx_uint32_t;
 
-#elif defined(CONFIG_TI_C5X)
+#elif defined(CONFIG_TI_C6X)
 
    typedef short spx_int16_t;
    typedef unsigned short spx_uint16_t;
diff --git a/pjmedia/src/pjmedia-codec/speex/stack_alloc.h b/pjmedia/src/pjmedia-codec/speex/stack_alloc.h
index 6270d12..cb048fa 100644
--- a/pjmedia/src/pjmedia-codec/speex/stack_alloc.h
+++ b/pjmedia/src/pjmedia-codec/speex/stack_alloc.h
@@ -114,7 +114,7 @@
 #define ALLOC(var, size, type) type var[size]
 #elif defined(USE_ALLOCA)
 #define VARDECL(var) var
-#define ALLOC(var, size, type) var = alloca(sizeof(type)*size)
+#define ALLOC(var, size, type) var = alloca(sizeof(type)*(size))
 #else
 #define VARDECL(var) var
 #define ALLOC(var, size, type) var = PUSH(stack, size, type)
diff --git a/pjmedia/src/pjmedia-codec/speex/window.c b/pjmedia/src/pjmedia-codec/speex/window.c
new file mode 100644
index 0000000..3748f65
--- /dev/null
+++ b/pjmedia/src/pjmedia-codec/speex/window.c
@@ -0,0 +1,94 @@
+/* Copyright (C) 2006 Jean-Marc Valin 
+   File: window.c
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "misc.h"
+
+#ifdef FIXED_POINT
+const spx_word16_t lpc_window[200] = { /* 200-entry LPC window in fixed point, full scale 16384: 120 rising entries, 51 at full scale, 29 falling */
+1310, 1313, 1321, 1333, 1352, 1375, 1403, 1436,
+1475, 1518, 1567, 1621, 1679, 1743, 1811, 1884,
+1962, 2044, 2132, 2224, 2320, 2421, 2526, 2636,
+2750, 2868, 2990, 3116, 3246, 3380, 3518, 3659,
+3804, 3952, 4104, 4259, 4417, 4578, 4742, 4909,
+5079, 5251, 5425, 5602, 5781, 5963, 6146, 6331,
+6518, 6706, 6896, 7087, 7280, 7473, 7668, 7863,
+8059, 8256, 8452, 8650, 8847, 9044, 9241, 9438,
+9635, 9831, 10026, 10220, 10414, 10606, 10797, 10987,
+11176, 11363, 11548, 11731, 11912, 12091, 12268, 12443,
+12615, 12785, 12952, 13116, 13277, 13435, 13590, 13742,
+13890, 14035, 14176, 14314, 14448, 14578, 14704, 14826,
+14944, 15058, 15168, 15273, 15374, 15470, 15562, 15649,
+15732, 15810, 15883, 15951, 16015, 16073, 16127, 16175,
+16219, 16257, 16291, 16319, 16342, 16360, 16373, 16381,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384,
+16384, 16384, 16384, 16361, 16294, 16183, 16028, 15830,
+15588, 15304, 14979, 14613, 14207, 13763, 13282, 12766,
+12215, 11631, 11016, 10373, 9702, 9007, 8289, 7551,
+6797, 6028, 5251, 4470, 3695, 2943, 2248, 1696
+};
+#else /* !FIXED_POINT */
+const spx_word16_t lpc_window[200] = { /* same 200-entry window as fractions with peak 1.0; the fixed-point entries are approximately these values * 16384 */
+   0.080000, 0.080158, 0.080630, 0.081418, 0.082520, 0.083935, 0.085663, 0.087703,
+   0.090052, 0.092710, 0.095674, 0.098943, 0.102514, 0.106385, 0.110553, 0.115015,
+   0.119769, 0.124811, 0.130137, 0.135744, 0.141628, 0.147786, 0.154212, 0.160902,
+   0.167852, 0.175057, 0.182513, 0.190213, 0.198153, 0.206328, 0.214731, 0.223357,
+   0.232200, 0.241254, 0.250513, 0.259970, 0.269619, 0.279453, 0.289466, 0.299651,
+   0.310000, 0.320507, 0.331164, 0.341965, 0.352901, 0.363966, 0.375151, 0.386449,
+   0.397852, 0.409353, 0.420943, 0.432615, 0.444361, 0.456172, 0.468040, 0.479958,
+   0.491917, 0.503909, 0.515925, 0.527959, 0.540000, 0.552041, 0.564075, 0.576091,
+   0.588083, 0.600042, 0.611960, 0.623828, 0.635639, 0.647385, 0.659057, 0.670647,
+   0.682148, 0.693551, 0.704849, 0.716034, 0.727099, 0.738035, 0.748836, 0.759493,
+   0.770000, 0.780349, 0.790534, 0.800547, 0.810381, 0.820030, 0.829487, 0.838746,
+   0.847800, 0.856643, 0.865269, 0.873672, 0.881847, 0.889787, 0.897487, 0.904943,
+   0.912148, 0.919098, 0.925788, 0.932214, 0.938372, 0.944256, 0.949863, 0.955189,
+   0.960231, 0.964985, 0.969447, 0.973615, 0.977486, 0.981057, 0.984326, 0.987290,
+   0.989948, 0.992297, 0.994337, 0.996065, 0.997480, 0.998582, 0.999370, 0.999842,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+   1.000000, 1.000000, 1.000000, 0.998640, 0.994566, 0.987787, 0.978324, 0.966203,
+   0.951458, 0.934131, 0.914270, 0.891931, 0.867179, 0.840084, 0.810723, 0.779182,
+   0.745551, 0.709930, 0.672424, 0.633148, 0.592223, 0.549781, 0.505964, 0.460932,
+   0.414863, 0.367968, 0.320511, 0.272858, 0.225569, 0.179655, 0.137254, 0.103524
+};
+#endif /* FIXED_POINT */
diff --git a/pjmedia/src/pjmedia-codec/speex_codec.c b/pjmedia/src/pjmedia-codec/speex_codec.c
index 6a64c77..b434ee8 100644
--- a/pjmedia/src/pjmedia-codec/speex_codec.c
+++ b/pjmedia/src/pjmedia-codec/speex_codec.c
@@ -683,9 +683,7 @@
 				     struct pjmedia_frame *output)
 {
     struct spx_private *spx;
-    float tmp[642]; /* 20ms at 32KHz + 2 */
-    pj_int16_t *samp_in;
-    unsigned i, samp_count, sz;
+    unsigned sz;
     int tx;
 
     spx = (struct spx_private*) codec->codec_data;
@@ -698,19 +696,11 @@
 	return PJ_SUCCESS;
     }
 
-    /* Copy frame to float buffer. */
-    samp_count = input->size / 2;
-    pj_assert(samp_count <= PJ_ARRAY_SIZE(tmp));
-    samp_in = input->buf;
-    for (i=0; i<samp_count; ++i) {
-	tmp[i] = samp_in[i];
-    }
-
     /* Flush all the bits in the struct so we can encode a new frame */
     speex_bits_reset(&spx->enc_bits);
 
     /* Encode the frame */
-    tx = speex_encode(spx->enc, tmp, &spx->enc_bits);
+    tx = speex_encode_int(spx->enc, input->buf, &spx->enc_bits);
 
     /* Check if we need not to transmit the frame (DTX) */
     if (tx == 0) {
@@ -743,9 +733,6 @@
 				     struct pjmedia_frame *output)
 {
     struct spx_private *spx;
-    float tmp[642]; /* 20ms at 32KHz + 2 */
-    pj_int16_t *dst_buf;
-    unsigned i, count, sz;
 
     spx = (struct spx_private*) codec->codec_data;
 
@@ -764,21 +751,11 @@
     speex_bits_read_from(&spx->dec_bits, input->buf, input->size);
 
     /* Decode the data */
-    speex_decode(spx->dec, &spx->dec_bits, tmp);
+    speex_decode_int(spx->dec, &spx->dec_bits, output->buf);
 
-    /* Check size. */
-    sz = speex_bits_nbytes(&spx->enc_bits);
-    pj_assert(sz <= output_buf_len);
-
-    /* Copy from float to short samples. */
-    count = spx_factory.speex_param[spx->param_id].clock_rate * 20 / 1000;
-    pj_assert((count <= output_buf_len/2) && count <= PJ_ARRAY_SIZE(tmp));
-    dst_buf = output->buf;
-    for (i=0; i<count; ++i) {
-	dst_buf[i] = (pj_int16_t)tmp[i];
-    }
     output->type = PJMEDIA_FRAME_TYPE_AUDIO;
+    output->size = spx_factory.speex_param[spx->param_id].clock_rate * 20 / 1000 * 2; /* decoded PCM bytes (20 ms of 16-bit samples), matching the packet-loss path; speex_bits_nbytes() would report the encoded size instead */
+    pj_assert(output->size <= (int)output_buf_len);
     output->timestamp.u64 = input->timestamp.u64;
 
 
@@ -793,9 +770,7 @@
 				      struct pjmedia_frame *output)
 {
     struct spx_private *spx;
-    float tmp[642]; /* 20ms at 32KHz + 2 */
-    pj_int16_t *dst_buf;
-    unsigned i, count;
+    unsigned count;
 
     /* output_buf_len is unreferenced when building in Release mode */
     PJ_UNUSED_ARG(output_buf_len);
@@ -803,16 +778,11 @@
     spx = (struct spx_private*) codec->codec_data;
 
     count = spx_factory.speex_param[spx->param_id].clock_rate * 20 / 1000;
-    pj_assert((count <= output_buf_len/2) && count <= PJ_ARRAY_SIZE(tmp));
+    pj_assert(count <= output_buf_len/2);
 
     /* Recover packet loss */
-    speex_decode(spx->dec, NULL, tmp);
+    speex_decode_int(spx->dec, NULL, output->buf);
 
-    /* Copy from float to short samples. */
-    dst_buf = output->buf;
-    for (i=0; i<count; ++i) {
-	dst_buf[i] = (pj_int16_t)tmp[i];
-    }
     output->size = count * 2;
 
     return PJ_SUCCESS;