$OpenBSD: patch-libavcodec_mips_aaccoder_mips_c,v 1.5 2016/01/14 06:09:43 ajacoutot Exp $

AAC encoder: refactor to resynchronize MIPS port

AAC encoder: simplify and speed up find_min_book

AAC encoder: Extensive improvements

AAC encoder: memoize quantize_band_cost

avcodec/mips/aaccoder_mips: Disable ff_aac_coder_init_mips() to prevent build failure

avcodec/mips/aaccoder_mips: Sync with the generic code

AAC encoder: improve SF range utilization

AAC encoder: various fixes in M/S coding

--- libavcodec/mips/aaccoder_mips.c.orig	Fri Dec 25 03:00:19 2015
+++ libavcodec/mips/aaccoder_mips.c	Wed Jan 13 15:13:20 2016
@@ -62,6 +62,8 @@
 #include "libavcodec/aac.h"
 #include "libavcodec/aacenc.h"
 #include "libavcodec/aactab.h"
+#include "libavcodec/aacenctab.h"
+#include "libavcodec/aacenc_utils.h"
 
 #if HAVE_INLINE_ASM
 typedef struct BandCodingPath {
@@ -70,21 +72,6 @@ typedef struct BandCodingPath {
     int run;
 } BandCodingPath;
 
-static const uint8_t run_value_bits_long[64] = {
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
-};
-
-static const uint8_t run_value_bits_short[16] = {
-    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
-};
-
-static const uint8_t * const run_value_bits[2] = {
-    run_value_bits_long, run_value_bits_short
-};
-
 static const uint8_t uquad_sign_bits[81] = {
     0, 1, 1, 1, 2, 2, 1, 2, 2,
     1, 2, 2, 2, 3, 3, 2, 3, 3,
@@ -144,65 +131,6 @@ static const uint8_t esc_sign_bits[289] = {
     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 };
 
-#define ROUND_STANDARD 0.4054f
-#define ROUND_TO_ZERO 0.1054f
-
-static void abs_pow34_v(float *out, const float *in, const int size) {
-#ifndef USE_REALLY_FULL_SEARCH
-    int i;
-    float a, b, c, d;
-    float ax, bx, cx, dx;
-
-    for (i = 0; i < size; i += 4) {
-        a = fabsf(in[i  ]);
-        b = fabsf(in[i+1]);
-        c = fabsf(in[i+2]);
-        d = fabsf(in[i+3]);
-
-        ax = sqrtf(a);
-        bx = sqrtf(b);
-        cx = sqrtf(c);
-        dx = sqrtf(d);
-
-        a = a * ax;
-        b = b * bx;
-        c = c * cx;
-        d = d * dx;
-
-        out[i  ] = sqrtf(a);
-        out[i+1] = sqrtf(b);
-        out[i+2] = sqrtf(c);
-        out[i+3] = sqrtf(d);
-    }
-#endif /* USE_REALLY_FULL_SEARCH */
-}
-
-static float find_max_val(int group_len, int swb_size, const float *scaled) {
-    float maxval = 0.0f;
-    int w2, i;
-    for (w2 = 0; w2 < group_len; w2++) {
-        for (i = 0; i < swb_size; i++) {
-            maxval = FFMAX(maxval, scaled[w2*128+i]);
-        }
-    }
-    return maxval;
-}
-
-static int find_min_book(float maxval, int sf) {
-    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
-    float Q34 = sqrtf(Q * sqrtf(Q));
-    int qmaxval, cb;
-    qmaxval = maxval * Q34 + 0.4054f;
-    if      (qmaxval ==  0) cb = 0;
-    else if (qmaxval ==  1) cb = 1;
-    else if (qmaxval ==  2) cb = 3;
-    else if (qmaxval <=  4) cb = 5;
-    else if (qmaxval <=  7) cb = 7;
-    else if (qmaxval <= 12) cb = 9;
-    else                    cb = 11;
-    return cb;
-}
-
 /**
  * Functions developed from template function and optimized for quantizing and encoding band
  */
@@ -210,14 +138,17 @@ static void quantize_and_encode_band_cost_SQUAD_mips(s
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -225,6 +156,7 @@ static void quantize_and_encode_band_cost_SQUAD_mips(s
         int curidx;
         int *in_int = (int *)&in[i];
         int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -276,21 +208,43 @@ static void quantize_and_encode_band_cost_SQUAD_mips(s
         curidx += 40;
 
         put_bits(pb, p_bits[curidx], p_codes[curidx]);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = vec[0] * IQ;
+            e2 = vec[1] * IQ;
+            e3 = vec[2] * IQ;
+            e4 = vec[3] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -300,6 +254,7 @@ static void quantize_and_encode_band_cost_UQUAD_mips(s
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -368,21 +323,43 @@ static void quantize_and_encode_band_cost_UQUAD_mips(s
         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
         v_bits  = p_bits[curidx] + count;
         put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = copysignf(vec[0] * IQ, in[i+0]);
+            e2 = copysignf(vec[1] * IQ, in[i+1]);
+            e3 = copysignf(vec[2] * IQ, in[i+2]);
+            e4 = copysignf(vec[3] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
@@ -392,6 +369,7 @@ static void quantize_and_encode_band_cost_SPAIR_mips(s
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4, t5, t6, t7;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -447,30 +425,54 @@ static void quantize_and_encode_band_cost_SPAIR_mips(s
         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
         v_bits  = p_bits[curidx] + p_bits[curidx2];
         put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx*2 ];
+            vec2 = &p_vec[curidx2*2];
+            e1 = vec1[0] * IQ;
+            e2 = vec1[1] * IQ;
+            e3 = vec2[0] * IQ;
+            e4 = vec2[1] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in, float *out,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
-                                                      int *bits, const float ROUNDING)
+                                                      int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
     for (i = 0; i < size; i += 4) {
-        int curidx, sign1, count1, sign2, count2;
+        int curidx1, curidx2, sign1, count1, sign2, count2;
         int *in_int = (int *)&in[i];
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -528,43 +530,67 @@ static void quantize_and_encode_band_cost_UPAIR7_mips(
               "memory"
         );
 
-        curidx  = 8 * qc1;
-        curidx += qc2;
+        curidx1  = 8 * qc1;
+        curidx1 += qc2;
 
-        v_codes = (p_codes[curidx] << count1) | sign1;
-        v_bits  = p_bits[curidx] + count1;
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
         put_bits(pb, v_bits, v_codes);
 
-        curidx  = 8 * qc3;
-        curidx += qc4;
+        curidx2  = 8 * qc3;
+        curidx2 += qc4;
 
-        v_codes = (p_codes[curidx] << count2) | sign2;
-        v_bits  = p_bits[curidx] + count2;
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                        PutBitContext *pb, const float *in, float *out,
                                                        const float *scaled, int size, int scale_idx,
                                                        int cb, const float lambda, const float uplim,
-                                                       int *bits, const float ROUNDING)
+                                                       int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
+    float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
 
     abs_pow34_v(s->scoefs, in, size);
     scaled = s->scoefs;
     for (i = 0; i < size; i += 4) {
-        int curidx, sign1, count1, sign2, count2;
+        int curidx1, curidx2, sign1, count1, sign2, count2;
         int *in_int = (int *)&in[i];
         uint8_t v_bits;
         unsigned int v_codes;
         int t0, t1, t2, t3, t4;
+        const float *vec1, *vec2;
 
         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
@@ -621,31 +647,53 @@ static void quantize_and_encode_band_cost_UPAIR12_mips
             : "memory"
         );
 
-        curidx  = 13 * qc1;
-        curidx += qc2;
+        curidx1  = 13 * qc1;
+        curidx1 += qc2;
 
-        v_codes = (p_codes[curidx] << count1) | sign1;
-        v_bits  = p_bits[curidx] + count1;
+        v_codes = (p_codes[curidx1] << count1) | sign1;
+        v_bits  = p_bits[curidx1] + count1;
         put_bits(pb, v_bits, v_codes);
 
-        curidx  = 13 * qc3;
-        curidx += qc4;
+        curidx2  = 13 * qc3;
+        curidx2 += qc4;
 
-        v_codes = (p_codes[curidx] << count2) | sign2;
-        v_bits  = p_bits[curidx] + count2;
+        v_codes = (p_codes[curidx2] << count2) | sign2;
+        v_bits  = p_bits[curidx2] + count2;
         put_bits(pb, v_bits, v_codes);
+
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+        }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                    PutBitContext *pb, const float *in, float *out,
                                                    const float *scaled, int size, int scale_idx,
                                                    int cb, const float lambda, const float uplim,
-                                                   int *bits, const float ROUNDING)
+                                                   int *bits, float *energy, const float ROUNDING)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
+    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;
 
     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
@@ -661,6 +709,7 @@ static void quantize_and_encode_band_cost_ESC_mips(str
             uint8_t v_bits;
             unsigned int v_codes;
             int t0, t1, t2, t3, t4;
+            const float *vec1, *vec2;
 
             qc1 = scaled[i  ] * Q34 + ROUNDING;
             qc2 = scaled[i+1] * Q34 + ROUNDING;
@@ -729,6 +778,24 @@ static void quantize_and_encode_band_cost_ESC_mips(str
             v_codes = (p_codes[curidx2] << count2) | sign2;
             v_bits  = p_bits[curidx2] + count2;
             put_bits(pb, v_bits, v_codes);
+
+            if (out || energy) {
+                float e1,e2,e3,e4;
+                vec1 = &p_vectors[curidx*2 ];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(vec1[0] * IQ, in[i+0]);
+                e2 = copysignf(vec1[1] * IQ, in[i+1]);
+                e3 = copysignf(vec2[0] * IQ, in[i+2]);
+                e4 = copysignf(vec2[1] * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
         }
     } else {
         for (i = 0; i < size; i += 4) {
@@ -839,15 +906,33 @@ static void quantize_and_encode_band_cost_ESC_mips(str
                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
                 put_bits(pb, len * 2 - 3, v_codes);
             }
+
+            if (out || energy) {
+                float e1, e2, e3, e4;
+                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
+            }
         }
     }
+    if (energy)
+        *energy = qenergy;
 }
 
 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
     av_assert0(0);
 }
 
@@ -855,7 +940,7 @@ static void quantize_and_encode_band_cost_ZERO_mips(st
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
     int i;
     if (bits)
         *bits = 0;
@@ -867,13 +952,15 @@ static void quantize_and_encode_band_cost_ZERO_mips(st
            out[i+3] = 0.0f;
         }
     }
+    if (energy)
+        *energy = 0.0f;
 }
 
 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
                                                          PutBitContext *pb, const float *in, float *out,
                                                          const float *scaled, int size, int scale_idx,
                                                          int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) = {
+                                                         int *bits, float *energy, const float ROUNDING) = {
     quantize_and_encode_band_cost_ZERO_mips,
     quantize_and_encode_band_cost_SQUAD_mips,
     quantize_and_encode_band_cost_SQUAD_mips,
@@ -894,17 +981,17 @@ static void (*const quantize_and_encode_band_cost_arr[
 
 #define quantize_and_encode_band_cost(                                       \
                                 s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)               \
+                                lambda, uplim, bits, energy, ROUNDING)       \
     quantize_and_encode_band_cost_arr[cb](                                   \
                                 s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)
+                                lambda, uplim, bits, energy, ROUNDING)
 
 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
                                           const float *in, float *out, int size, int scale_idx,
                                           int cb, const float lambda, int rtz)
 {
     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
+                                  INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
 }
 
 /**
@@ -1384,7 +1471,7 @@ static float (*const get_band_numbits_arr[])(struct AA
 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy, int rtz)
 {
     return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 }
@@ -1397,7 +1484,7 @@ static float get_band_cost_ZERO_mips(struct AACEncCont
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
     int i;
     float cost = 0;
@@ -1410,6 +1497,8 @@ static float get_band_cost_ZERO_mips(struct AACEncCont
     }
     if (bits)
         *bits = 0;
+    if (energy)
+        *energy = 0.0f;
     return cost * lambda;
 }
 
@@ -1417,7 +1506,7 @@ static float get_band_cost_NONE_mips(struct AACEncCont
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
     av_assert0(0);
     return 0;
@@ -1427,12 +1516,13 @@ static float get_band_cost_SQUAD_mips(struct AACEncCon
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1499,6 +1589,9 @@ static float get_band_cost_SQUAD_mips(struct AACEncCon
         curbits += p_bits[curidx];
         vec     = &p_codes[curidx*4];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1533,6 +1626,8 @@ static float get_band_cost_SQUAD_mips(struct AACEncCon
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1540,12 +1635,13 @@ static float get_band_cost_UQUAD_mips(struct AACEncCon
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int curbits = 0;
     int qc1, qc2, qc3, qc4;
 
@@ -1598,6 +1694,9 @@ static float get_band_cost_UQUAD_mips(struct AACEncCon
         curbits += uquad_sign_bits[curidx];
         vec     = &p_codes[curidx*4];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1635,6 +1734,8 @@ static float get_band_cost_UQUAD_mips(struct AACEncCon
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1642,12 +1743,13 @@ static float get_band_cost_SPAIR_mips(struct AACEncCon
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1719,6 +1821,9 @@ static float get_band_cost_SPAIR_mips(struct AACEncCon
         vec     = &p_codes[curidx*2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                  \n\t"
             ".set noreorder                             \n\t"
@@ -1753,6 +1858,8 @@ static float get_band_cost_SPAIR_mips(struct AACEncCon
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1760,12 +1867,13 @@ static float get_band_cost_UPAIR7_mips(struct AACEncCo
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
-                                       int *bits)
+                                       int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1849,6 +1957,9 @@ static float get_band_cost_UPAIR7_mips(struct AACEncCo
         curbits += upair7_sign_bits[curidx2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                          \n\t"
             ".set noreorder                                     \n\t"
@@ -1886,6 +1997,8 @@ static float get_band_cost_UPAIR7_mips(struct AACEncCo
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -1893,12 +2006,13 @@ static float get_band_cost_UPAIR12_mips(struct AACEncC
                                         PutBitContext *pb, const float *in,
                                         const float *scaled, int size, int scale_idx,
                                         int cb, const float lambda, const float uplim,
-                                        int *bits)
+                                        int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -1982,6 +2096,9 @@ static float get_band_cost_UPAIR12_mips(struct AACEncC
         vec     = &p_codes[curidx*2];
         vec2    = &p_codes[curidx2*2];
 
+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
         __asm__ volatile (
             ".set push                                          \n\t"
             ".set noreorder                                     \n\t"
@@ -2019,6 +2136,8 @@ static float get_band_cost_UPAIR12_mips(struct AACEncC
 
     if (bits)
         *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
     return cost * lambda + curbits;
 }
 
@@ -2026,13 +2145,14 @@ static float get_band_cost_ESC_mips(struct AACEncConte
                                     PutBitContext *pb, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
-                                    int *bits)
+                                    int *bits, float *energy)
 {
     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
     const float CLIPPED_ESCAPE = 165140.0f * IQ;
     int i;
     float cost = 0;
+    float qenergy = 0.0f;
     int qc1, qc2, qc3, qc4;
     int curbits = 0;
 
@@ -2042,7 +2162,7 @@ static float get_band_cost_ESC_mips(struct AACEncConte
     for (i = 0; i < size; i += 4) {
         const float *vec, *vec2;
         int curidx, curidx2;
-        float t1, t2, t3, t4;
+        float t1, t2, t3, t4, V;
         float di1, di2, di3, di4;
         int cond0, cond1, cond2, cond3;
         int c1, c2, c3, c4;
@@ -2114,38 +2234,54 @@ static float get_band_cost_ESC_mips(struct AACEncConte
         if (cond0) {
             if (t1 >= CLIPPED_ESCAPE) {
                 di1 = t1 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di1 = t1 - c1 * cbrtf(c1) * IQ;
+                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di1 = t1 - vec[0] * IQ;
+        } else {
+            di1 = t1 - (V = vec[0] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond1) {
             if (t2 >= CLIPPED_ESCAPE) {
                 di2 = t2 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di2 = t2 - c2 * cbrtf(c2) * IQ;
+                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di2 = t2 - vec[1] * IQ;
+        } else {
+            di2 = t2 - (V = vec[1] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond2) {
             if (t3 >= CLIPPED_ESCAPE) {
                 di3 = t3 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di3 = t3 - c3 * cbrtf(c3) * IQ;
+                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di3 = t3 - vec2[0] * IQ;
+        } else {
+            di3 = t3 - (V = vec2[0] * IQ);
+            qenergy += V*V;
+        }
 
         if (cond3) {
             if (t4 >= CLIPPED_ESCAPE) {
                 di4 = t4 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
             } else {
-                di4 = t4 - c4 * cbrtf(c4) * IQ;
+                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
+                qenergy += V*V;
             }
-        } else
-            di4 = t4 - vec2[1]*IQ;
+        } else {
+            di4 = t4 - (V = vec2[1]*IQ);
+            qenergy += V*V;
+        }
 
         cost += di1 * di1 + di2 * di2
                 + di3 * di3 + di4 * di4;
@@ -2160,7 +2296,7 @@ static float (*const get_band_cost_arr[])(struct AACEn
                                           PutBitContext *pb, const float *in,
                                           const float *scaled, int size, int scale_idx,
                                           int cb, const float lambda, const float uplim,
-                                          int *bits) = {
+                                          int *bits, float *energy) = {
     get_band_cost_ZERO_mips,
     get_band_cost_SQUAD_mips,
     get_band_cost_SQUAD_mips,
@@ -2181,404 +2317,182 @@ static float (*const get_band_cost_arr[])(struct AACEn
 
 #define get_band_cost(                                  \
                                 s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
+                                lambda, uplim, bits, energy)            \
     get_band_cost_arr[cb](                              \
                                 s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+                                lambda, uplim, bits, energy)
 
 static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                 const float *scaled, int size, int scale_idx,
                                 int cb, const float lambda, const float uplim,
-                                int *bits)
+                                int *bits, float *energy, int rtz)
 {
-    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
 }
 
-static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
-                                               AACEncContext *s,
-                                               SingleChannelElement *sce,
-                                               const float lambda)
-{
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
-    float dists[128] = { 0 }, uplims[128];
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
+#include "libavcodec/aacenc_quantization_misc.h"
 
-    destbits = FFMIN(destbits, 5800);
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            int nz = 0;
-            float uplim = 0.0f;
-            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim += band->threshold;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
-                    sce->zeroes[(w+w2)*16+g] = 1;
-                    continue;
-                }
-                nz = 1;
-            }
-            uplims[w*16+g] = uplim *512;
-            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
-            allz |= nz;
-        }
-    }
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            if (sce->zeroes[w*16+g]) {
-                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
-                continue;
-            }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
-        }
-    }
+#include "libavcodec/aaccoder_twoloop.h"
 
-    if (!allz)
-        return;
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-
-    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        start = w*128;
-        for (g = 0;  g < sce->ics.num_swb; g++) {
-            const float *scaled = s->scoefs + start;
-            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
-            start += sce->ics.swb_sizes[g];
-        }
-    }
-
-    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
-        qstep = its ? 1 : 32;
-        do {
-            int prev = -1;
-            tbits = 0;
-            fflag = 0;
-
-            if (qstep > 1) {
-                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                    start = w*128;
-                    for (g = 0;  g < sce->ics.num_swb; g++) {
-                        const float *coefs = sce->coeffs + start;
-                        const float *scaled = s->scoefs + start;
-                        int bits = 0;
-                        int cb;
-
-                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                            start += sce->ics.swb_sizes[g];
-                            continue;
-                        }
-                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                            int b;
-                            bits += quantize_band_cost_bits(s, coefs + w2*128,
-                                                            scaled + w2*128,
-                                                            sce->ics.swb_sizes[g],
-                                                            sce->sf_idx[w*16+g],
-                                                            cb,
-                                                            1.0f,
-                                                            INFINITY,
-                                                            &b);
-                        }
-                        if (prev != -1) {
-                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                        }
-                        tbits += bits;
-                        start += sce->ics.swb_sizes[g];
-                        prev = sce->sf_idx[w*16+g];
-                    }
-                }
-            }
-            else {
-                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-                    start = w*128;
-                    for (g = 0;  g < sce->ics.num_swb; g++) {
-                        const float *coefs = sce->coeffs + start;
-                        const float *scaled = s->scoefs + start;
-                        int bits = 0;
-                        int cb;
-                        float dist = 0.0f;
-
-                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
-                            start += sce->ics.swb_sizes[g];
-                            continue;
-                        }
-                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
-                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                            int b;
-                            dist += quantize_band_cost(s, coefs + w2*128,
-                                                       scaled + w2*128,
-                                                       sce->ics.swb_sizes[g],
-                                                       sce->sf_idx[w*16+g],
-                                                       cb,
-                                                       1.0f,
-                                                       INFINITY,
-                                                       &b);
-                            bits += b;
-                        }
-                        dists[w*16+g] = dist - bits;
-                        if (prev != -1) {
-                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
-                        }
-                        tbits += bits;
-                        start += sce->ics.swb_sizes[g];
-                        prev = sce->sf_idx[w*16+g];
-                    }
-                }
-            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
-            }
-            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
-                qstep = 1;
-        } while (qstep);
-
-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
-        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-            for (g = 0; g < sce->ics.num_swb; g++) {
-                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else
-                        sce->sf_idx[w*16+g]-=2;
-                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
-                if (sce->sf_idx[w*16+g] != prevsc)
-                    fflag = 1;
-                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
-            }
-        }
-        its++;
-    } while (fflag && its < 10);
-}
-
 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
 {
-    int start = 0, i, w, w2, g;
+    int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
+    uint8_t nextband0[128], nextband1[128];
     float M[128], S[128];
     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
+    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
     SingleChannelElement *sce0 = &cpe->ch[0];
     SingleChannelElement *sce1 = &cpe->ch[1];
     if (!cpe->common_window)
         return;
+
+    /** Scout out next nonzero bands */
+    ff_init_nextband_map(sce0, nextband0);
+    ff_init_nextband_map(sce1, nextband1);
+
+    prev_mid = sce0->sf_idx[0];
+    prev_side = sce1->sf_idx[0];
     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        start = 0;
         for (g = 0;  g < sce0->ics.num_swb; g++) {
-            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
-                float dist1 = 0.0f, dist2 = 0.0f;
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            if (!cpe->is_mask[w*16+g])
+                cpe->ms_mask[w*16+g] = 0;
+            if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
-                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
-                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
-                    float minthr = FFMIN(band0->threshold, band1->threshold);
-                    float maxthr = FFMAX(band0->threshold, band1->threshold);
-                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
-                        M[i  ] = (sce0->coeffs[start+w2*128+i  ]
-                                + sce1->coeffs[start+w2*128+i  ]) * 0.5;
-                        M[i+1] = (sce0->coeffs[start+w2*128+i+1]
-                                + sce1->coeffs[start+w2*128+i+1]) * 0.5;
-                        M[i+2] = (sce0->coeffs[start+w2*128+i+2]
-                                + sce1->coeffs[start+w2*128+i+2]) * 0.5;
-                        M[i+3] = (sce0->coeffs[start+w2*128+i+3]
-                                + sce1->coeffs[start+w2*128+i+3]) * 0.5;
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                        S[i] =  M[i]
+                              - sce1->coeffs[start+(w+w2)*128+i];
+                    }
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
 
-                        S[i  ] =  M[i  ]
-                                - sce1->coeffs[start+w2*128+i  ];
-                        S[i+1] =  M[i+1]
-                                - sce1->coeffs[start+w2*128+i+1];
-                        S[i+2] =  M[i+2]
-                                - sce1->coeffs[start+w2*128+i+2];
-                        S[i+3] =  M[i+3]
-                                - sce1->coeffs[start+w2*128+i+3];
-                   }
-                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
-                                                L34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                s->lambda / band0->threshold, INFINITY, NULL);
-                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
-                                                R34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                s->lambda / band1->threshold, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, M,
-                                                M34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                s->lambda / maxthr, INFINITY, NULL);
-                    dist2 += quantize_band_cost(s, S,
-                                                S34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                s->lambda / minthr, INFINITY, NULL);
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
+                    if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
+                        && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
+                            || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[w*16+g],
+                                                    sce0->band_type[w*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[w*16+g],
+                                                    sce1->band_type[w*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    mididx,
+                                                    midcb,
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sididx,
+                                                    sidcb,
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= b1+b2;
+                        dist2 -= b3+b4;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
+                            /* ms_mask unneeded, and it confuses some decoders */
+                            cpe->ms_mask[w*16+g] = 0;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
+                    }
                 }
-                cpe->ms_mask[w*16+g] = dist2 < dist1;
             }
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                prev_mid = sce0->sf_idx[w*16+g];
+            if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                prev_side = sce1->sf_idx[w*16+g];
             start += sce0->ics.swb_sizes[g];
         }
     }
 }
 #endif /*HAVE_MIPSFPU */
 
-static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
-                                       int win, int group_len, const float lambda)
-{
-    BandCodingPath path[120][12];
-    int w, swb, cb, start, size;
-    int i, j;
-    const int max_sfb  = sce->ics.max_sfb;
-    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
-    const int run_esc  = (1 << run_bits) - 1;
-    int idx, ppos, count;
-    int stackrun[120], stackcb[120], stack_len;
-    float next_minbits = INFINITY;
-    int next_mincb = 0;
+#include "libavcodec/aaccoder_trellis.h"
 
-    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
-    start = win*128;
-    for (cb = 0; cb < 12; cb++) {
-        path[0][cb].cost     = run_bits+4;
-        path[0][cb].prev_idx = -1;
-        path[0][cb].run      = 0;
-    }
-    for (swb = 0; swb < max_sfb; swb++) {
-        size = sce->ics.swb_sizes[swb];
-        if (sce->zeroes[win*16 + swb]) {
-            float cost_stay_here = path[swb][0].cost;
-            float cost_get_here  = next_minbits + run_bits + 4;
-            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
-                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
-                cost_stay_here += run_bits;
-            if (cost_get_here < cost_stay_here) {
-                path[swb+1][0].prev_idx = next_mincb;
-                path[swb+1][0].cost     = cost_get_here;
-                path[swb+1][0].run      = 1;
-            } else {
-                path[swb+1][0].prev_idx = 0;
-                path[swb+1][0].cost     = cost_stay_here;
-                path[swb+1][0].run      = path[swb][0].run + 1;
-            }
-            next_minbits = path[swb+1][0].cost;
-            next_mincb = 0;
-            for (cb = 1; cb < 12; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-        } else {
-            float minbits = next_minbits;
-            int mincb = next_mincb;
-            int startcb = sce->band_type[win*16+swb];
-            next_minbits = INFINITY;
-            next_mincb = 0;
-            for (cb = 0; cb < startcb; cb++) {
-                path[swb+1][cb].cost = 61450;
-                path[swb+1][cb].prev_idx = -1;
-                path[swb+1][cb].run = 0;
-            }
-            for (cb = startcb; cb < 12; cb++) {
-                float cost_stay_here, cost_get_here;
-                float bits = 0.0f;
-                for (w = 0; w < group_len; w++) {
-                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
-                                                    s->scoefs + start + w*128, size,
-                                                    sce->sf_idx[(win+w)*16+swb], cb,
-                                                    0, INFINITY, NULL);
-                }
-                cost_stay_here = path[swb][cb].cost + bits;
-                cost_get_here  = minbits            + bits + run_bits + 4;
-                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
-                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
-                    cost_stay_here += run_bits;
-                if (cost_get_here < cost_stay_here) {
-                    path[swb+1][cb].prev_idx = mincb;
-                    path[swb+1][cb].cost     = cost_get_here;
-                    path[swb+1][cb].run      = 1;
-                } else {
-                    path[swb+1][cb].prev_idx = cb;
-                    path[swb+1][cb].cost     = cost_stay_here;
-                    path[swb+1][cb].run      = path[swb][cb].run + 1;
-                }
-                if (path[swb+1][cb].cost < next_minbits) {
-                    next_minbits = path[swb+1][cb].cost;
-                    next_mincb = cb;
-                }
-            }
-        }
-        start += sce->ics.swb_sizes[swb];
-    }
-
-    stack_len = 0;
-    idx       = 0;
-    for (cb = 1; cb < 12; cb++)
-        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
-            idx = cb;
-    ppos = max_sfb;
-    while (ppos > 0) {
-        av_assert1(idx >= 0);
-        cb = idx;
-        stackrun[stack_len] = path[ppos][cb].run;
-        stackcb [stack_len] = cb;
-        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
-        ppos -= path[ppos][cb].run;
-        stack_len++;
-    }
-
-    start = 0;
-    for (i = stack_len - 1; i >= 0; i--) {
-        put_bits(&s->pb, 4, stackcb[i]);
-        count = stackrun[i];
-        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
-        for (j = 0; j < count; j++) {
-            sce->band_type[win*16 + start] =  stackcb[i];
-            start++;
-        }
-        while (count >= run_esc) {
-            put_bits(&s->pb, run_bits, run_esc);
-            count -= run_esc;
-        }
-        put_bits(&s->pb, run_bits, count);
-    }
-}
 #endif /* HAVE_INLINE_ASM */
 
 void ff_aac_coder_init_mips(AACEncContext *c) {
 #if HAVE_INLINE_ASM
     AACCoefficientsEncoder *e = c->coder;
-    int option = c->options.aac_coder;
+    int option = c->options.coder;
 
     if (option == 2) {
-// Disabled due to failure with fate-aac-pns-encode
-//         e->quantize_and_encode_band = quantize_and_encode_band_mips;
-//         e->encode_window_bands_info = codebook_trellis_rate_mips;
+        e->quantize_and_encode_band = quantize_and_encode_band_mips;
+        e->encode_window_bands_info = codebook_trellis_rate;
 #if HAVE_MIPSFPU
-        e->search_for_quantizers    = search_for_quantizers_twoloop_mips;
-        e->search_for_ms            = search_for_ms_mips;
+        e->search_for_quantizers    = search_for_quantizers_twoloop;
 #endif /* HAVE_MIPSFPU */
     }
+#if HAVE_MIPSFPU
+    e->search_for_ms            = search_for_ms_mips;
+#endif /* HAVE_MIPSFPU */
 #endif /* HAVE_INLINE_ASM */
 }
