More ticket #774: optimization for siren7/siren14 codecs

git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@2616 74dad513-b988-da41-8d7b-12977e46ad98
diff --git a/third_party/g7221/decode/coef2sam.c b/third_party/g7221/decode/coef2sam.c
index a52095d..87deab0 100644
--- a/third_party/g7221/decode/coef2sam.c
+++ b/third_party/g7221/decode/coef2sam.c
@@ -88,7 +88,7 @@
 
     
 
-    half_dct_size = shr(dct_length,1);
+    half_dct_size = shr_nocheck(dct_length,1);
     
     /* Perform a Type IV (inverse) DCT on the coefficients */
     dct_type_iv_s(coefs, new_samples, dct_length);
@@ -98,7 +98,7 @@
     {
         for(index=0;index<dct_length;index++)
         {
-            new_samples[index] = shr(new_samples[index],mag_shift);
+            new_samples[index] = shr_nocheck(new_samples[index],mag_shift);
             move16();
         }
     }
@@ -110,7 +110,7 @@
             mag_shift = negate(mag_shift);
             for(index=0;index<dct_length;index++)
             {
-                new_samples[index] = shl(new_samples[index],mag_shift);
+                new_samples[index] = shl_nocheck(new_samples[index],mag_shift);
                 move16();
             }
         }
@@ -147,7 +147,7 @@
         move32();
         sum = L_mac(sum,*win_new++, *--new_ptr);
         sum = L_mac(sum,*--win_old, *old_ptr++);
-        *out_ptr++ = itu_round(L_shl(sum,2));
+        *out_ptr++ = itu_round(L_shl_nocheck(sum,2));
         move16();
 
     }
@@ -160,7 +160,7 @@
         move32();
         sum = L_mac(sum,*win_new++, *new_ptr++);
         sum = L_mac(sum,negate(*--win_old), *--old_ptr);
-        *out_ptr++ = itu_round(L_shl(sum,2));
+        *out_ptr++ = itu_round(L_shl_nocheck(sum,2));
         move16();
     }
         
diff --git a/third_party/g7221/decode/dct4_s.c b/third_party/g7221/decode/dct4_s.c
index 82405a3..0123a13 100644
--- a/third_party/g7221/decode/dct4_s.c
+++ b/third_party/g7221/decode/dct4_s.c
@@ -140,9 +140,9 @@
         /*===========================================================*/
         
         /*    set_span      = 1 << (DCT_LENGTH_LOG - set_count_log); */
-        set_span = shr(dct_length,set_count_log);
+        set_span = shr_nocheck(dct_length,set_count_log);
            
-        set_count     = shl(1,set_count_log);
+        set_count     = shl_nocheck(1,set_count_log);
         in_ptr        = in_buffer;
         move16();
         next_out_base = out_buffer;
@@ -185,12 +185,18 @@
                     /* IF THIS WORKS, IT'S PREFERABLE */
                         
                     dummy = add(in_val_low,dither_ptr[i++]);
-                    acca = L_add(dummy,in_val_high);
-                    out_val_low = extract_l(L_shr(acca,1));
+		    // blp: sum of two 16-bit vars, there's no way
+		    //      it'll overflow a 32-bit var
+                    //acca = L_add(dummy,in_val_high);
+		    acca = dummy + in_val_high;
+                    out_val_low = extract_l(L_shr_nocheck(acca,1));
                     
                     dummy = add(in_val_low,dither_ptr[i++]);
-                    acca = L_add(dummy,-in_val_high);
-                    out_val_high = extract_l(L_shr(acca,1));
+		    // blp: difference of two 16-bit vars, there's no way
+		    //      it'll overflow a 32-bit var
+                    //acca = L_add(dummy,-in_val_high);
+		    acca = dummy - in_val_high;
+                    out_val_high = extract_l(L_shr_nocheck(acca,1));
                     
                     *out_ptr_low++  = out_val_low;
                     move16();
@@ -284,6 +290,17 @@
     {
         for ( k=0; k<CORE_SIZE; k++ )
         {
+#if PJ_HAS_INT64
+	    /* blp: danger! 64-bit sum is saturated only once after the loop, so results may differ from the L_mac path; but faster */
+	    pj_int64_t sum64=0;
+            move32();
+            
+            for ( i=0; i<CORE_SIZE; i++ )
+            {
+                sum64 += L_mult(pair_ptr[i], dct_core_s[i][k]);
+            }
+	    sum = L_saturate(sum64);
+#else
             sum=0L;
             move32();
             
@@ -291,6 +308,7 @@
             {
                 sum = L_mac(sum, pair_ptr[i],dct_core_s[i][k]);
             }
+#endif
             buffer_swap[k] = itu_round(sum);
         }
         
@@ -323,9 +341,9 @@
         /*===========================================================*/
         
         /*    set_span      = 1 << (DCT_LENGTH_LOG - set_count_log); */
-        set_span = shr(dct_length,set_count_log);
+        set_span = shr_nocheck(dct_length,set_count_log);
         
-        set_count     = shl(1,set_count_log);
+        set_count     = shl_nocheck(1,set_count_log);
         next_in_base  = in_buffer;
         move16();
         test();
@@ -354,7 +372,7 @@
             in_ptr_low     = next_in_base;
             move16();
             
-            temp = shr(set_span,1);
+            temp = shr_nocheck(set_span,1);
             in_ptr_high    = in_ptr_low + temp;
             move16();
             
@@ -401,25 +419,25 @@
                 
                 sum = L_mac(sum,cos_even,in_low_even);
                 sum = L_mac(sum,negate(msin_even),in_high_even);
-                out_low_even = itu_round(L_shl(sum,1));
+                out_low_even = itu_round(L_shl_nocheck(sum,1));
                 
                 sum = 0L;
                 move32();
                 sum = L_mac(sum,msin_even,in_low_even);
                 sum = L_mac(sum,cos_even,in_high_even);
-                out_high_even = itu_round(L_shl(sum,1));
+                out_high_even = itu_round(L_shl_nocheck(sum,1));
                 
                 sum = 0L;
                 move32();
                 sum = L_mac(sum,cos_odd,in_low_odd);
                 sum = L_mac(sum,msin_odd,in_high_odd);
-                out_low_odd = itu_round(L_shl(sum,1));
+                out_low_odd = itu_round(L_shl_nocheck(sum,1));
                 
                 sum = 0L;
                 move32();
                 sum = L_mac(sum,msin_odd,in_low_odd);
                 sum = L_mac(sum,negate(cos_odd),in_high_odd);
-                out_high_odd = itu_round(L_shl(sum,1));
+                out_high_odd = itu_round(L_shl_nocheck(sum,1));
                 
                 *out_ptr_low++  = out_low_even;
                 move16();
@@ -458,7 +476,10 @@
     {
         for(i=0;i<320;i++) 
         {
-           sum = L_add(output[i],syn_bias_7khz[i]);
+	   // blp: sum of two 16-bit vars, there's no way
+	   //      it'll overflow a 32-bit var
+           //sum = L_add(output[i],syn_bias_7khz[i]);
+	   sum = output[i] + syn_bias_7khz[i];
            acca = L_sub(sum,32767);
            test();
            if (acca > 0) 
@@ -466,7 +487,10 @@
                sum = 32767L;
                move32();
            }
-           acca = L_add(sum,32768L);
+	   // blp: sum was clamped above to 32767, so adding 32768
+	   //      can't overflow a 32-bit var
+           //acca = L_add(sum,32768L);
+	   acca = sum + 32768;
            test();
            if (acca < 0) 
            {
diff --git a/third_party/g7221/decode/decoder.c b/third_party/g7221/decode/decoder.c
index c6b8c06..d642a91 100644
--- a/third_party/g7221/decode/decoder.c
+++ b/third_party/g7221/decode/decoder.c
@@ -136,7 +136,7 @@
         for (i=0; i<num_categorization_control_bits; i++) 
         {
         	get_next_bit(bitobj);
-        	categorization_control = shl(categorization_control,1);
+        	categorization_control = shl_nocheck(categorization_control,1);
         	categorization_control = add(categorization_control,bitobj->next_bit);
         }
         
@@ -246,7 +246,7 @@
     for (i=0; i<5; i++) 
     {
         get_next_bit(bitobj);
-        index = shl(index,1);
+        index = shl_nocheck(index,1);
         index = add(index,bitobj->next_bit);
     }
     bitobj->number_of_bits_left = sub(bitobj->number_of_bits_left,5);
@@ -332,7 +332,7 @@
     while ((i >= 0) && ((temp1 >= 0) || (temp2 > 0))) 
     {
         i = sub(i,1);
-        temp = shr(temp,1);
+        temp = shr_nocheck(temp,1);
         max_index = sub(max_index,2);
         temp1 = sub(temp,8);
         temp2 = sub(max_index,28);
@@ -530,13 +530,13 @@
                     test();
                     if (bitobj->next_bit == 0)
 	                {
-                        temp = shl(index,1);
+                        temp = shl_nocheck(index,1);
                         index = (Word16)*(decoder_table_ptr + temp);
                         move16();
                     }
 	                else
 	                {
-                        temp = shl(index,1);
+                        temp = shl_nocheck(index,1);
                         index = (Word16)*(decoder_table_ptr + temp + 1);
                         move16();
                     }
@@ -567,18 +567,18 @@
                         for (j=0; j<num_sign_bits; j++) 
                         {
 		                    get_next_bit(bitobj);
-       		                signs_index = shl(signs_index,1);
+       		                signs_index = shl_nocheck(signs_index,1);
 		                    signs_index = add(signs_index,bitobj->next_bit);
 		                    bitobj->number_of_bits_left = sub(bitobj->number_of_bits_left,1);
 	                    }
 	                    temp = sub(num_sign_bits,1);
-                        bit = shl(1,(temp));
+                        bit = shl_nocheck(1,(temp));
 	                }
 	                
                     for (j=0; j<vec_dim; j++) 
                     {
 	                    acca = L_mult0(standard_deviation,mlt_quant_centroid[category][k[j]]);
-                        acca = L_shr(acca,12);
+                        acca = L_shr_nocheck(acca,12);
                         decoder_mlt_value = extract_l(acca);
 	                    
                         test();
@@ -587,7 +587,7 @@
 		                    test();
                             if ((signs_index & bit) == 0)
 		                        decoder_mlt_value = negate(decoder_mlt_value);
-		                    bit = shr(bit,1);
+		                    bit = shr_nocheck(bit,1);
 	                    }
                         *decoder_mlt_ptr++ = decoder_mlt_value;
                         move16();
@@ -652,7 +652,7 @@
                     }
 	                *decoder_mlt_ptr = temp1;
                     move16();
-	                random_word = shr(random_word,1);
+	                random_word = shr_nocheck(random_word,1);
 	            }
 	            /* pointer arithmetic */
                 decoder_mlt_ptr++;
@@ -677,7 +677,7 @@
                     }
 	                *decoder_mlt_ptr = temp1;
                     move16();
-	                random_word  = shr(random_word,1);
+	                random_word  = shr_nocheck(random_word,1);
 	            }
 	            /* pointer arithmetic */
                 decoder_mlt_ptr++;
@@ -710,7 +710,7 @@
                 }
                 *decoder_mlt_ptr++ = temp1;
                 move16();
-                random_word = shr(random_word,1);
+                random_word = shr_nocheck(random_word,1);
             }
             random_word = get_rand(randobj);
             for (j=0; j<10; j++) 
@@ -730,7 +730,7 @@
                 
                 *decoder_mlt_ptr++ = temp1;
                 move16();
-                random_word = shr(random_word,1);
+                random_word = shr_nocheck(random_word,1);
             }
         }
     }
@@ -1059,7 +1059,7 @@
         move16();
     }
     bitobj->code_bit_count = sub(bitobj->code_bit_count,1);
-    temp = shr(bitobj->current_word,bitobj->code_bit_count);
+    temp = shr_nocheck(bitobj->current_word,bitobj->code_bit_count);
     logic16();
     bitobj->next_bit = (Word16 )(temp & 1);