More ticket #590: optimized the echo suppressor, making it more than 3x faster. Added echo suppressor entries to mips_test

git-svn-id: https://svn.pjsip.org/repos/pjproject/trunk@2212 74dad513-b988-da41-8d7b-12977e46ad98
diff --git a/pjmedia/src/pjmedia/echo_suppress.c b/pjmedia/src/pjmedia/echo_suppress.c
index d084378..19e48b9 100644
--- a/pjmedia/src/pjmedia/echo_suppress.c
+++ b/pjmedia/src/pjmedia/echo_suppress.c
@@ -243,6 +243,12 @@
     float	*tmp_corr;	    /* Temporary corr array calculation	    */
     float	 best_corr;	    /* Best correlation so far.		    */
 
+    unsigned	 sum_rec_level;	    /* Running sum of level in rec_hist	    */
+    float	 rec_corr;	    /* Running corr in rec_hist.	    */
+
+    unsigned	 sum_play_level0;   /* Running sum of level for first pos   */
+    float	 play_corr0;	    /* Running corr for first pos.	    */
+
     float	*min_factor;	    /* Array of minimum scaling factor	    */
     float	*avg_factor;	    /* Array of average scaling factor	    */
     float	*tmp_factor;	    /* Array to store provisional result    */
@@ -353,6 +359,8 @@
     ec->last_factor = 1.0;
     ec->residue = 0;
     ec->running_cnt = 0;
+    ec->sum_rec_level = ec->sum_play_level0 = 0;
+    ec->rec_corr = ec->play_corr0 = 0;
 }
 
 /*
@@ -375,6 +383,8 @@
     ec->best_corr = MAX_FLOAT;
     ec->residue = 0;
     ec->running_cnt = 0;
+    ec->sum_rec_level = ec->sum_play_level0 = 0;
+    ec->rec_corr = ec->play_corr0 = 0;
 
     PJ_LOG(4,(THIS_FILE, "Echo suppressor soft reset. Re-learning.."));
 }
@@ -403,8 +413,9 @@
 			     const pj_int16_t *play_frm)
 {
     int prev_index;
-    unsigned i, frm_level, sum_rec_level;
-    float rec_corr;
+    unsigned i, j, frm_level, sum_play_level, ulaw;
+    pj_uint16_t old_rec_frm_level, old_play_frm_level;
+    float play_corr;
 
     ++ec->update_cnt;
     if (ec->update_cnt > 0x7FFFFFFF)
@@ -414,6 +425,9 @@
     frm_level = pjmedia_calc_avg_signal(play_frm, ec->samples_per_segment);
     ++frm_level; /* to avoid division by zero */
 
+    /* Save the oldest frame level for later */
+    old_play_frm_level = ec->play_hist[0];
+
     /* Push current frame level to the back of the play history */
     pj_array_erase(ec->play_hist, sizeof(pj_uint16_t), ec->play_hist_cnt, 0);
     ec->play_hist[ec->play_hist_cnt-1] = (pj_uint16_t) frm_level;
@@ -422,6 +436,9 @@
     frm_level = pjmedia_calc_avg_signal(rec_frm, ec->samples_per_segment);
     ++frm_level; /* to avoid division by zero */
 
+    /* Save the oldest frame level for later */
+    old_rec_frm_level = ec->rec_hist[0];
+
     /* Push to the back of the rec history */
     pj_array_erase(ec->rec_hist, sizeof(pj_uint16_t), ec->templ_cnt, 0);
     ec->rec_hist[ec->templ_cnt-1] = (pj_uint16_t) frm_level;
@@ -437,33 +454,86 @@
 
 
     /* Calculate rec signal pattern */
-    rec_corr = 0;
-    sum_rec_level = 0;
-    for (i=0; i < ec->templ_cnt-1; ++i) {
-	float corr;
-	corr = (float)ec->rec_hist[i+1] / ec->rec_hist[i];
-	rec_corr += corr;
-	sum_rec_level += ec->rec_hist[i];
+    if (ec->sum_rec_level == 0) {
+	/* Buffer has just been filled up, do full calculation */
+	ec->rec_corr = 0;
+	ec->sum_rec_level = 0;
+	for (i=0; i < ec->templ_cnt-1; ++i) {
+	    float corr;
+	    corr = (float)ec->rec_hist[i+1] / ec->rec_hist[i];
+	    ec->rec_corr += corr;
+	    ec->sum_rec_level += ec->rec_hist[i];
+	}
+	ec->sum_rec_level += ec->rec_hist[i];
+    } else {
+	/* Update from previous calculation */
+	ec->sum_rec_level = ec->sum_rec_level - old_rec_frm_level + 
+			    ec->rec_hist[ec->templ_cnt-1];
+	ec->rec_corr = ec->rec_corr - ((float)ec->rec_hist[0] / 
+					      old_rec_frm_level) +
+		       ((float)ec->rec_hist[ec->templ_cnt-1] /
+			       ec->rec_hist[ec->templ_cnt-2]);
     }
-    sum_rec_level += ec->rec_hist[i];
 
     /* Iterate through the play history and calculate the signal correlation
      * for every tail position in the play_hist. Save the result in temporary
      * array since we may bail out early if the conversation state is not good
      * to detect echo.
      */
-    for (i=0; i < ec->tail_cnt; ++i) {
-	unsigned j, end, sum_play_level, ulaw;
-	float play_corr = 0, corr_diff;
-
+    /* 
+     * First phase: do full calculation for the first position 
+     */
+    if (ec->sum_play_level0 == 0) {
+	/* Buffer has just been filled up, do full calculation */
 	sum_play_level = 0;
-	for (j=i, end=i+ec->templ_cnt-1; j<end; ++j) {
+	play_corr = 0;
+	for (j=0; j<ec->templ_cnt-1; ++j) {
 	    float corr;
 	    corr = (float)ec->play_hist[j+1] / ec->play_hist[j];
 	    play_corr += corr;
 	    sum_play_level += ec->play_hist[j];
 	}
 	sum_play_level += ec->play_hist[j];
+	ec->sum_play_level0 = sum_play_level;
+	ec->play_corr0 = play_corr;
+    } else {
+	/* Update from previous calculation */
+	ec->sum_play_level0 = ec->sum_play_level0 - old_play_frm_level + 
+			      ec->play_hist[ec->templ_cnt-1];
+	ec->play_corr0 = ec->play_corr0 - ((float)ec->play_hist[0] / 
+					          old_play_frm_level) +
+		         ((float)ec->play_hist[ec->templ_cnt-1] /
+			         ec->play_hist[ec->templ_cnt-2]);
+	sum_play_level = ec->sum_play_level0;
+	play_corr = ec->play_corr0;
+    }
+    ec->tmp_corr[0] = FABS(play_corr - ec->rec_corr);
+    ec->tmp_factor[0] = (float)ec->sum_rec_level / sum_play_level;
+
+    /* Bail out if remote isn't talking */
+    ulaw = pjmedia_linear2ulaw(sum_play_level/ec->templ_cnt) ^ 0xFF;
+    if (ulaw < MIN_SIGNAL_ULAW) {
+	echo_supp_set_state(ec, ST_REM_SILENT, ulaw);
+	return;
+    }
+    /* Bail out if local user is talking */
+    if (ec->sum_rec_level > sum_play_level) {
+	echo_supp_set_state(ec, ST_LOCAL_TALK, ulaw);
+	return;
+    }
+
+    /*
+     * Second phase: do incremental calculation for the rest of positions
+     */
+    for (i=1; i < ec->tail_cnt; ++i) {
+	unsigned end;
+
+	end = i + ec->templ_cnt;
+
+	sum_play_level = sum_play_level - ec->play_hist[i-1] +
+			 ec->play_hist[end-1];
+	play_corr = play_corr - ((float)ec->play_hist[i]/ec->play_hist[i-1]) +
+		    ((float)ec->play_hist[end-1]/ec->play_hist[end-2]);
 
 	/* Bail out if remote isn't talking */
 	ulaw = pjmedia_linear2ulaw(sum_play_level/ec->templ_cnt) ^ 0xFF;
@@ -473,7 +543,7 @@
 	}
 
 	/* Bail out if local user is talking */
-	if (sum_rec_level >= sum_play_level) {
+	if (ec->sum_rec_level > sum_play_level) {
 	    echo_supp_set_state(ec, ST_LOCAL_TALK, ulaw);
 	    return;
 	}
@@ -481,7 +551,7 @@
 #if 0
 	// disabled: not a good idea if mic throws out loud echo
 	/* Also bail out if we suspect there's a doubletalk */
-	ulaw = pjmedia_linear2ulaw(sum_rec_level/ec->templ_cnt) ^ 0xFF;
+	ulaw = pjmedia_linear2ulaw(ec->sum_rec_level/ec->templ_cnt) ^ 0xFF;
 	if (ulaw > MIN_SIGNAL_ULAW) {
 	    echo_supp_set_state(ec, ST_DOUBLETALK, ulaw);
 	    return;
@@ -489,11 +559,10 @@
 #endif
 
 	/* Calculate correlation and save to temporary array */
-	corr_diff = FABS(play_corr - rec_corr);
-	ec->tmp_corr[i] = corr_diff;
+	ec->tmp_corr[i] = FABS(play_corr - ec->rec_corr);
 
 	/* Also calculate the gain factor between mic and speaker level */
-	ec->tmp_factor[i] = (float)sum_rec_level / sum_play_level;
+	ec->tmp_factor[i] = (float)ec->sum_rec_level / sum_play_level;
 	pj_assert(ec->tmp_factor[i] < 1);
     }
 
@@ -652,7 +721,7 @@
 		 */
 		factor = 1.0;
 		echo_supp_set_state(ec, ST_LOCAL_TALK, rec_level);
-	    } else if (rec_level >= play_level) {
+	    } else if (rec_level > play_level) {
 		/* Seems that both are talking. Scale the mic signal
 		 * down a little bit to reduce echo, while allowing both
 		 * parties to talk at the same time.
@@ -688,7 +757,7 @@
 	if (factor >= ec->last_factor)
 	    factor = (factor + ec->last_factor) / 2;
 	else
-	    factor = (factor + ec->last_factor*9) / 10;
+	    factor = (factor + ec->last_factor*19) / 20;
 
 	/* Amplify frame */
 	amplify_frame(rec_frm, ec->samples_per_frame, 
diff --git a/pjmedia/src/test/mips_test.c b/pjmedia/src/test/mips_test.c
index c0cd64e..4a4c4f5 100644
--- a/pjmedia/src/test/mips_test.c
+++ b/pjmedia/src/test/mips_test.c
@@ -51,14 +51,18 @@
 		ARMv7-M		Cortex-M3   1.250 MIPS/MHz
     */
 
-//#   define CPU_MHZ	    (2666)
-//#   define CPU_IPS	    (CPU_MHZ * MEGA * 3.039)	/* P4 2.6GHz	*/
+#   define CPU_MHZ	    (2666)
+#   define CPU_IPS	    (3.039 * CPU_MHZ * MEGA)	/* P4 2.6GHz	*/
 
-#   define CPU_MHZ	    700
-#   define CPU_IPS	    (700 * MEGA * 2.708)	/* P3 700Mhz	*/
+//#   define CPU_MHZ	    700
+//#   define CPU_IPS	    (700 * MEGA * 2.708)	/* P3 700Mhz	*/
 
 //#   define CPU_MHZ	    180
 //#   define CPU_IPS	    (CPU_MHZ * MEGA * 1.1)	/* ARM926EJ-S */
+
+//#   define CPU_MHZ	    312
+//#   define CPU_IPS	    (CPU_MHZ * MEGA * 1.282)	/* Dell Axim PDA */
+
 #endif
 
 
@@ -356,7 +360,7 @@
 };
 
 #define THIS_FILE	    "mips_test.c"
-#define DURATION	    1000
+#define DURATION	    5000
 #define PTIME		    20	/* MUST be 20! */
 #define MEGA		    1000000
 #define GIGA		    1000000000
@@ -1263,7 +1267,6 @@
     pjmedia_port *gen_port, *ec_port;
     pj_status_t status;
 
-    PJ_UNUSED_ARG(flags);
     PJ_UNUSED_ARG(te);
 
     gen_port = create_gen_port(pool, clock_rate, channel_count, 
@@ -1272,7 +1275,7 @@
 	return NULL;
 
     status = pjmedia_echo_port_create(pool, gen_port, ec_tail_msec, 0,
-				      0, &ec_port);
+				      flags, &ec_port);
     if (status != PJ_SUCCESS)
 	return NULL;
 
@@ -1287,6 +1290,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(100, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1299,6 +1303,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(128, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1311,6 +1316,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(200, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1323,6 +1329,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(256, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1336,6 +1343,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(400, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1348,6 +1356,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(500, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1360,6 +1369,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(512, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1372,6 +1382,7 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
     return ec_create(600, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -1384,6 +1395,127 @@
 				   unsigned flags,
 				   struct test_entry *te)
 {
+    flags = 0;
+    return ec_create(800, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+
+
+/* Echo suppressor with 100ms tail length */
+static pjmedia_port* es_create_100(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(100, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 128ms tail length */
+static pjmedia_port* es_create_128(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(128, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 200ms tail length */
+static pjmedia_port* es_create_200(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(200, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 256ms tail length */
+static pjmedia_port* es_create_256(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(256, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+
+/* Echo suppressor with 400ms tail length */
+static pjmedia_port* es_create_400(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(400, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 500ms tail length */
+static pjmedia_port* es_create_500(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(500, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 512ms tail length */
+static pjmedia_port* es_create_512(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(512, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 600ms tail length */
+static pjmedia_port* es_create_600(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
+    return ec_create(600, pool, clock_rate, channel_count, samples_per_frame,
+		     flags, te);
+}
+
+/* Echo suppressor with 800ms tail length */
+static pjmedia_port* es_create_800(pj_pool_t *pool,
+				   unsigned clock_rate,
+				   unsigned channel_count,
+				   unsigned samples_per_frame,
+				   unsigned flags,
+				   struct test_entry *te)
+{
+    flags = PJMEDIA_ECHO_SIMPLE;
     return ec_create(800, pool, clock_rate, channel_count, samples_per_frame,
 		     flags, te);
 }
@@ -2128,6 +2260,15 @@
 	{ "echo canceller 512ms tail len", OP_GET_PUT, K8|K16, &ec_create_512},
 	{ "echo canceller 600ms tail len", OP_GET_PUT, K8|K16, &ec_create_600},
 	{ "echo canceller 800ms tail len", OP_GET_PUT, K8|K16, &ec_create_800},
+	{ "echo suppressor 100ms tail len", OP_GET_PUT, K8|K16, &es_create_100},
+	{ "echo suppressor 128ms tail len", OP_GET_PUT, K8|K16, &es_create_128},
+	{ "echo suppressor 200ms tail len", OP_GET_PUT, K8|K16, &es_create_200},
+	{ "echo suppressor 256ms tail len", OP_GET_PUT, K8|K16, &es_create_256},
+	{ "echo suppressor 400ms tail len", OP_GET_PUT, K8|K16, &es_create_400},
+	{ "echo suppressor 500ms tail len", OP_GET_PUT, K8|K16, &es_create_500},
+	{ "echo suppressor 512ms tail len", OP_GET_PUT, K8|K16, &es_create_512},
+	{ "echo suppressor 600ms tail len", OP_GET_PUT, K8|K16, &es_create_600},
+	{ "echo suppressor 800ms tail len", OP_GET_PUT, K8|K16, &es_create_800},
 	{ "tone generator with single freq", OP_GET, K8|K16, &create_tonegen1},
 	{ "tone generator with dual freq", OP_GET, K8|K16, &create_tonegen2},
 	{ "codec encode/decode - G.711", OP_PUT, K8, &g711_encode_decode},
@@ -2195,6 +2336,8 @@
 	    usec = (pj_elapsed_usec(&tzero, &times[0]) + 
 		    pj_elapsed_usec(&tzero, &times[1])) / 2;
 
+	    usec = usec / (DURATION / 1000);
+
 	    mips = (float)(CPU_IPS * usec / 1000000.0 / 1000000);
 	    cpu_pct = (float)(100.0 * usec / 1000000);
 	    PJ_LOG(3,(THIS_FILE, "%2dKHz %-38s % 8d %8.3f %7.2f",